Unverified commit 4a60b45d authored by Li Zhang, committed by GitHub

[Feature] Support Qwen-7B, dynamic NTK scaling and logN scaling in turbomind (#230)

* qwen support

* dynamic ntk & logn attn

* fix ntk & add chat template

* fix ntk scaling & stop words

* fix lint

* add tiktoken to requirements.txt

* fix tokenizer, set model format automatically

* update model.py

* update readme

* fix lint
parent 62b60db7
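For reference, the two sequence-length tricks this commit wires into TurboMind can be sketched on the host side as follows. This is a minimal illustration following Qwen-7B's released recipe; the function names here are illustrative, and the device-side counterparts added below are `rotary_embedding_get_base` and `logn_attn_get_scaling`.

```python
import math

def dynamic_ntk_base(context_len: int, max_pos: int, rot_dim: int,
                     base: float = 10000.0) -> float:
    # Grow the RoPE base once the context exceeds the trained length
    # (dynamic NTK scaling as used by Qwen-7B).
    ntk_alpha = max(2 ** math.ceil(math.log2(context_len / max_pos) + 1) - 1, 1)
    return base * ntk_alpha ** (rot_dim / (rot_dim - 2))

def logn_attn_scale(context_len: int, max_pos: int) -> float:
    # Scale the query by log_{max_pos}(context_len) beyond the trained length.
    return math.log(context_len, max_pos) if context_len > max_pos else 1.0

print(dynamic_ntk_base(16384, 8192, 128))  # larger base -> slower-rotating RoPE
print(logn_attn_scale(16384, 8192))        # ~1.077
```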
......@@ -13,6 +13,7 @@ ______________________________________________________________________
## News 🎉
- \[2023/08\] TurboMind supports Qwen-7B, dynamic NTK-RoPE scaling and dynamic logN scaling
- \[2023/08\] TurboMind supports Windows (tp=1)
- \[2023/08\] TurboMind supports 4-bit inference, 2.4x faster than FP16, the fastest open-source implementation🚀. Check [this](./docs/en/w4a16.md) guide for detailed info
- \[2023/08\] LMDeploy has launched on the [HuggingFace Hub](https://huggingface.co/lmdeploy), providing ready-to-use 4-bit models.
......
......@@ -13,6 +13,7 @@ ______________________________________________________________________
## News 🎉
- \[2023/08\] TurboMind supports Qwen-7B, dynamic NTK-RoPE scaling and dynamic logN scaling
- \[2023/08\] TurboMind supports Windows (tp=1)
- \[2023/08\] TurboMind supports 4-bit inference, 2.4x faster than FP16, the fastest open-source implementation🚀. See [this](./docs/zh_cn/w4a16.md) guide for deployment details
- \[2023/08\] LMDeploy has launched on the [HuggingFace Hub](https://huggingface.co/lmdeploy), providing ready-to-use 4-bit models
......
......@@ -16,7 +16,8 @@ class Tokenizer:
self.pad_id = self.model.pad_id()
else:
from transformers import AutoTokenizer
self.model = AutoTokenizer.from_pretrained(model_file)
self.model = AutoTokenizer.from_pretrained(model_file,
trust_remote_code=True)
self.vocab_size = self.model.vocab_size
self.start_id = self.model.bos_token_id
self.end_id = self.model.eos_token_id
......
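A note on the `trust_remote_code=True` change above: Qwen-7B ships its tiktoken-based tokenizer as custom code inside the model repository, so `AutoTokenizer` cannot load it without the flag. A minimal sketch (the Hub id is used only for illustration):

```python
from transformers import AutoTokenizer

# Qwen's tokenizer is implemented in remote code bundled with the checkpoint,
# hence trust_remote_code=True.
tok = AutoTokenizer.from_pretrained('Qwen/Qwen-7B-Chat', trust_remote_code=True)
print(tok.encode('hello'))
```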
......@@ -161,6 +161,36 @@ If a question does not make any sense, or is not factually coherent, explain why
return f'{self.b_inst} {prompt} {self.e_inst} '
@MODELS.register_module(name='qwen-7b')
class Qwen7BChat(BaseModel):
"""Chat template for Qwen-7B-Chat."""
def __init__(self):
super().__init__()
self.session_len = 8192
self.top_p = 0.5
self.top_k = 40
self.temperature = 1.0
self.im_start = '<|im_start|>'
self.im_end = '<|im_end|>'
self.system = 'You are a helpful assistant.'
def get_prompt(self, prompt, sequence_start=True):
if sequence_start:
return f'{self.im_start}system\n{self.system}{self.im_end}' \
f'\n{self.im_start}user\n{prompt}{self.im_end}' \
f'\n{self.im_start}assistant\n'
return f'\n{self.im_start}user\n{prompt}{self.im_end}' \
f'\n{self.im_start}assistant\n'
@property
def stop_words(self):
"""Return the stop-words' token ids."""
return [151645] # <|im_end|>
def main(model_name: str = 'test'):
assert model_name in MODELS.module_dict.keys(), \
f"'{model_name}' is not supported. " \
......
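For illustration, the first-turn prompt produced by the new `qwen-7b` template (ChatML format), assuming lmdeploy from this branch is importable and `MODELS.get` returns the registered class as usual:

```python
from lmdeploy.model import MODELS

qwen = MODELS.get('qwen-7b')()
print(qwen.get_prompt('Hello!', sequence_start=True))
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
```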
......@@ -16,7 +16,7 @@ from sentencepiece import SentencePieceProcessor
import lmdeploy
from lmdeploy.model import MODELS
supported_formats = ['llama', 'hf', 'awq']
supported_formats = ['llama', 'hf', 'awq', 'qwen']
def get_package_root_path():
......@@ -84,7 +84,7 @@ def copy_triton_model_templates(_path: str):
return None
def tokenizer_info(model_path: str):
def tokenizer_info_sp(model_path: str):
"""Return the vocabulary size, bos token id and eos token id.
Args:
......@@ -101,6 +101,13 @@ def tokenizer_info(model_path: str):
return n_words, bos_id, eos_id
def tokenizer_info_qwen(model_dir: str):
"""Return the vocabulary size, bos and eos token ids, hard-coded for Qwen-7B's tiktoken-based tokenizer."""
n_words = 151851
bos_id = 0
eos_id = 151643
return n_words, bos_id, eos_id
def export(model_name: str,
num_layer: int,
norm_eps: float,
......@@ -111,7 +118,11 @@ def export(model_name: str,
tp: int,
size_per_head: int = 128,
group_size: int = 0,
weight_type: str = 'fp16'):
weight_type: str = 'fp16',
max_position_embeddings: int = 0,
use_dynamic_ntk: int = 0,
use_logn_attn: int = 0,
tokenizer_info=tokenizer_info_sp):
"""Export deploying information to a config file.
Args:
......@@ -191,7 +202,7 @@ def export(model_name: str,
head_num=head_num,
kv_head_num=kv_head_num,
size_per_head=size_per_head,
vocab_size=vocab_size,
vocab_size=_vocab_size,
num_layer=num_layer,
rotary_embedding=size_per_head,
inter_size=inter_size,
......@@ -210,7 +221,11 @@ def export(model_name: str,
cache_chunk_size=1,
use_context_fmha=1,
quant_policy=0,
tensor_para_size=tp))
tensor_para_size=tp,
# extra attention params
max_position_embeddings=max_position_embeddings,
use_dynamic_ntk=int(use_dynamic_ntk),
use_logn_attn=int(use_logn_attn)))
config = configparser.ConfigParser()
for section, key_values in cfg.items():
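The three extra attention keys end up in the generated `config.ini` alongside the existing TurboMind parameters. A hedged sketch of reading them back (the workspace layout and section name are assumptions based on LMDeploy's usual output):

```python
import configparser

cfg = configparser.ConfigParser()
cfg.read('./workspace/triton_models/weights/config.ini')  # assumed path
section = cfg.sections()[0]  # typically 'llama'
print(cfg.getint(section, 'max_position_embeddings'),
      cfg.getint(section, 'use_dynamic_ntk'),
      cfg.getint(section, 'use_logn_attn'))
```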
......@@ -725,6 +740,134 @@ def deploy_awq(model_name: str, model_path: str, tokenizer_path: str,
group_size=group_size)
def deploy_qwen(model_name: str, model_path: str, tokenizer_path: str,
triton_models_path: str, tp: int):
"""Deploy a model with huggingface transformers' format.
Args:
model_name (str): the name of the to-be-deployed model
model_path (str): the path of the directory where the model weight
files are
tokenizer_path (str): the path of the tokenizer model path
triton_models_path (str): the path of the exported triton models
tp (int): the number of tensor parallelism
quant_path (str): path of the quantized model, which can be None
group_size (int): a parameter used in AWQ to quantize fp16 weights
to 4 bits
"""
if osp.exists(model_path):
shutil.copy(osp.join(model_path, 'qwen.tiktoken'),
osp.join(triton_models_path, 'tokenizer'))
for _file in os.listdir(model_path):
if _file.endswith('.json') or _file.endswith('.py'):
json_path = osp.join(model_path, _file)
shutil.copy(json_path,
osp.join(triton_models_path, 'tokenizer', _file))
with get_package_root_path() as root_path:
shutil.copy(osp.join(root_path, 'turbomind/tokenizer.py'),
osp.join(triton_models_path, 'tokenizer'))
else:
print(f'model path {model_path} does not exist')
exit(-1)
# read model arguments from config.json
try:
params_path = osp.join(model_path, 'config.json')
with open(params_path) as f:
config = json.load(f)
num_layer = config['num_hidden_layers']
norm_eps = config['layer_norm_epsilon']
if 'num_key_value_heads' in config:
kv_head_num = config['num_key_value_heads']
else:
kv_head_num = config['num_attention_heads']
seq_length = config['seq_length']
use_dynamic_ntk = config['use_dynamic_ntk']
use_logn_attn = config['use_logn_attn']
except Exception as e:
print(f'get "num_hidden_layers" and "layer_norm_epsilon" from '
f'{params_path} failed: {e}')
return False
# convert weights from hf to turbomind
model_params = {}
_files = [file for file in os.listdir(model_path) if file.endswith('.bin')]
_files = sorted(_files)
print(_files)
_params = {}
for _file in _files:
_tmp = torch.load(osp.join(model_path, _file), map_location='cpu')
_params.update(_tmp)
def get_tensor(name, trans=True):
"""return a transposed tensor according its name."""
if trans:
return _params[name].cuda().t()
else:
return _params[name].cuda()
for i in range(num_layer):
print(i)
# qkv weights
qkv_w = get_tensor(f'transformer.h.{i}.attn.c_attn.weight')
q_w, k_w, v_w = torch.split(qkv_w, qkv_w.size(-1) // 3, dim=-1)
q_w, k_w = permute(q_w), permute(k_w)
qkv_w = merge_qkv(q_w, k_w, v_w, tp, dim=2)
model_params[f'layers.{i}.attention.w_qkv.weight'] = qkv_w
# qkv bias
qkv_b = get_tensor(f'transformer.h.{i}.attn.c_attn.bias')
q_b, k_b, v_b = torch.split(qkv_b, qkv_b.size(-1) // 3)
q_b, k_b = permute(q_b), permute(k_b)
qkv_b = merge_qkv(q_b, k_b, v_b, tp, dim=1)
model_params[f'layers.{i}.attention.w_qkv.bias'] = qkv_b
# o weights
o_w = get_tensor(f'transformer.h.{i}.attn.c_proj.weight')
model_params[f'layers.{i}.attention.wo.weight'] = o_w
model_params[f'layers.{i}.attention.wo.bias'] = torch.zeros_like(q_b)
# ffn weights
# ours: w2(silu(w1(x)) * w3(x))
# qwen: c_proj(w1(x) * silu(w2(x)))
w1 = get_tensor(f'transformer.h.{i}.mlp.w2.weight')
w3 = get_tensor(f'transformer.h.{i}.mlp.w1.weight')
w2 = get_tensor(f'transformer.h.{i}.mlp.c_proj.weight')
model_params[f'layers.{i}.feed_forward.w1.weight'] = w1
model_params[f'layers.{i}.feed_forward.w2.weight'] = w2
model_params[f'layers.{i}.feed_forward.w3.weight'] = w3
# norm weights
attn_norm = get_tensor(f'transformer.h.{i}.ln_1.weight')
ffn_norm = get_tensor(f'transformer.h.{i}.ln_2.weight')
model_params[f'layers.{i}.attention_norm.weight'] = attn_norm
model_params[f'layers.{i}.ffn_norm.weight'] = ffn_norm
other = [('tok_embeddings.weight', 'transformer.wte.weight'),
('norm.weight', 'transformer.ln_f.weight'),
('output.weight', 'lm_head.weight')]
for ft, hf in other:
model_params[ft] = get_tensor(hf, trans=False)
return export(model_name,
num_layer,
norm_eps,
kv_head_num,
model_params,
model_path,
triton_models_path,
tp,
max_position_embeddings=seq_length,
use_dynamic_ntk=use_dynamic_ntk,
use_logn_attn=use_logn_attn,
tokenizer_info=tokenizer_info_qwen)
def pack_model_repository(workspace_path: str):
"""package the model repository.
......@@ -752,7 +895,7 @@ def pack_model_repository(workspace_path: str):
def main(model_name: str,
model_path: str,
model_format: str = 'hf',
model_format: str = None,
tokenizer_path: str = None,
dst_path: str = './workspace',
tp: int = 1,
......@@ -777,6 +920,9 @@ def main(model_name: str,
f"'{model_name}' is not supported. " \
f'The supported models are: {MODELS.module_dict.keys()}'
if model_format is None:
model_format = 'qwen' if model_name == 'qwen-7b' else 'hf'
if model_format not in supported_formats:
print(f'the model format "{model_format}" is not supported. '
f'The supported formats are: {supported_formats}')
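With the automatic format selection above, converting Qwen-7B-Chat needs only the model name and path. A sketch of driving the converter from Python (the module path assumes this release's layout; the checkpoint path is a placeholder):

```python
# 'qwen-7b' makes main() pick model_format='qwen' automatically.
from lmdeploy.serve.turbomind.deploy import main as deploy_main

deploy_main(model_name='qwen-7b',
            model_path='/path/to/Qwen-7B-Chat',
            dst_path='./workspace',
            tp=1)
```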
......@@ -803,6 +949,9 @@ def main(model_name: str,
elif model_format == 'awq':
res = deploy_awq(model_name, model_path, tokenizer_path,
triton_models_path, tp, quant_path, group_size)
elif model_format == 'qwen':
res = deploy_qwen(model_name, model_path, tokenizer_path,
triton_models_path, tp)
# update `tensor_para_size` in `triton_models/interactive/config.pbtxt`
with open(osp.join(triton_models_path, 'interactive/config.pbtxt'),
......
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
from typing import Sequence, Union
......@@ -99,6 +100,13 @@ class HuggingFaceTokenizer:
if hasattr(self.model, 'backend_tokenizer'):
self.model.backend_tokenizer.save(backend_tokenizer_file)
if self.model.eos_token_id is None:
generation_config_file = osp.join(model_dir,
'generation_config.json')
with open(generation_config_file, 'r') as f:
cfg = json.load(f)
self.model.eos_token_id = cfg['eos_token_id']
@property
def vocab_size(self):
"""vocabulary size."""
......
......@@ -60,13 +60,6 @@ struct Multihead_attention_params_base {
// The input Vs and the associated bias. Dimensions B x D and D, resp.
const T *v = nullptr, *v_bias = nullptr;
// The cache for the Ks. The size must be at least B x L x D.
T* k_cache = nullptr;
// The cache for the Vs. The size must be at least B x L x D.
T* v_cache = nullptr;
// The indirections to use for cache when beam sampling.
const int* cache_indir = nullptr;
// scales
const float* query_weight_output_scale = nullptr;
const float* attention_qk_scale = nullptr;
......@@ -108,10 +101,6 @@ struct Multihead_attention_params_base {
// The slope per head of linear position bias to attention score (H).
const T* linear_bias_slopes = nullptr;
const T* ia3_key_weights = nullptr;
const T* ia3_value_weights = nullptr;
const int* ia3_tasks = nullptr;
const float* qkv_scale_out = nullptr;
const float* attention_out_scale = nullptr;
int int8_mode = 0;
......@@ -123,17 +112,15 @@ struct Multihead_attention_params_base {
template<typename T>
struct Multihead_attention_params: public Multihead_attention_params_base<T> {
// allows attention to exit early
bool* finished = nullptr;
// required in case of masked attention with different length
const int* length_per_sample = nullptr;
T** k_cache_per_sample = nullptr;
T** v_cache_per_sample = nullptr;
size_t kv_cache_per_sample_offset = 0;
bool k_cache_interleaved = true;
int num_kv_heads = 0;
bool* finished = nullptr;
const int* length_per_sample = nullptr;
T** k_cache_per_sample = nullptr;
T** v_cache_per_sample = nullptr;
size_t kv_cache_per_sample_offset = 0;
int num_kv_heads = 0;
int max_position_embeddings = 0;
bool use_dynamic_ntk = false;
bool use_logn_attn = false;
};
template<class T>
......
......@@ -41,15 +41,12 @@
////////////////////////////////////////////////////////////////////////////////////////////////////
// !!! Specialize the launcher for Cross attention
template<typename T, int Dh, int Dh_MAX, typename KERNEL_PARAMS_TYPE>
void mmha_launch_kernel(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream)
{
constexpr int THREADS_PER_VALUE = threads_per_value_t<T, Dh_MAX>::value;
// constexpr bool DO_CROSS_ATTENTION = std::is_same<KERNEL_PARAMS_TYPE, Cross_multihead_attention_params<T>>::value;
const int tlength = params.timestep;
FT_CHECK(params.cache_indir == nullptr);
const int tlength = params.timestep;
if (params.int8_mode == 4) {
if (tlength < 32) {
......
......@@ -18,8 +18,6 @@
#include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/turbomind/macro.h"
// #include "src/turbomind/utils/cuda_bf16_wrapper.h"
// #include "src/turbomind/utils/cuda_fp8_utils.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include <assert.h>
#include <float.h>
......@@ -592,42 +590,6 @@ __inline__ __device__ uint4 vec_conversion<uint4, Float8_>(const Float8_& a)
return b;
}
#ifdef ENABLE_FP8
// fp8_t
template<>
__inline__ __device__ float vec_conversion<float, __nv_fp8_e4m3>(const __nv_fp8_e4m3& a)
{
return float(a);
}
template<>
__inline__ __device__ __nv_fp8_e4m3 vec_conversion<__nv_fp8_e4m3, float>(const float& a)
{
return __nv_fp8_e4m3(a);
}
// fp8_2_t
template<>
__inline__ __device__ float2 vec_conversion<float2, fp8_2_t>(const fp8_2_t& a)
{
return float2(a);
}
template<>
__inline__ __device__ fp8_2_t vec_conversion<fp8_2_t, float2>(const float2& a)
{
return fp8_2_t(a);
}
// fp8_4_t
template<>
__inline__ __device__ float4 vec_conversion<float4, fp8_4_t>(const fp8_4_t& a)
{
return float4(a);
}
template<>
__inline__ __device__ fp8_4_t vec_conversion<fp8_4_t, float4>(const float4& a)
{
return fp8_4_t(a);
}
#endif // ENABLE_FP8
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int THREADS_PER_KEY, typename K_vec, int N>
......@@ -868,19 +830,6 @@ inline __device__ void convert_from_float(bf16_8_t& dst, Float8_ src)
////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_FP8
inline __device__ void convert_from_float(fp8_4_t& dst, float4 src)
{
dst = fp8_4_t(src);
}
inline __device__ void convert_from_float(fp8_2_t& dst, float2 src)
{
dst = fp8_2_t(src);
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
inline __device__ void convert_from_float(float2& dst, float2 src)
{
dst = src;
......@@ -1365,8 +1314,6 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
// The thread in the block.
const int tidx = threadIdx.x;
constexpr bool handle_kv = true;
// While doing the product Q*K^T for the different keys we track the max.
float qk_max = -FLT_MAX;
......@@ -1422,28 +1369,36 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
Qk_vec_k k_bias;
zero(k_bias);
if (handle_kv) {
k_bias =
!is_masked && (Dh == Dh_MAX || tidx * QK_VEC_SIZE < Dh) && params.k_bias != nullptr ?
vec_conversion<Qk_vec_k, Qk_vec_m>(*reinterpret_cast<const Qk_vec_m*>(&params.k_bias[k_bias_offset])) :
k_bias;
}
k_bias = !is_masked && (Dh == Dh_MAX || tidx * QK_VEC_SIZE < Dh) && params.k_bias != nullptr ?
vec_conversion<Qk_vec_k, Qk_vec_m>(*reinterpret_cast<const Qk_vec_m*>(&params.k_bias[k_bias_offset])) :
k_bias;
// Computes the Q/K values with bias.
q = add(q, q_bias);
if (handle_kv) {
k = add(k, k_bias);
k = add(k, k_bias);
float rotary_emb_base = 10000.f;
if (params.use_dynamic_ntk) {
// +1 because of `length_per_sample == context_length - 1`
rotary_emb_base = rotary_embedding_get_base(params.length_per_sample[bi] + 1,
params.max_position_embeddings,
params.rotary_embedding_dim,
rotary_emb_base);
}
// Padded len
const int padd_len = (params.total_padding_tokens == nullptr) ? 0 : params.total_padding_tokens[bi];
if (params.rotary_embedding_dim > 0) {
if (handle_kv) {
apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, params.timestep - padd_len);
}
else {
apply_rotary_embedding(q, tidx, params.rotary_embedding_dim, params.timestep - padd_len);
}
apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, rotary_emb_base, params.timestep - padd_len);
}
if (params.use_logn_attn) {
T log_n_scaling;
// +1 because of `length_per_sample == context_length - 1`
convert_from_float(log_n_scaling,
logn_attn_get_scaling(params.length_per_sample[bi] + 1, params.max_position_embeddings));
q = mul<Qk_vec_k, T, Qk_vec_k>(log_n_scaling, q);
}
if (!is_masked) {
......@@ -1462,47 +1417,23 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
// The position of the thread in that 16B chunk.
int ci = tidx % QK_VECS_IN_16B * QK_VEC_SIZE;
if (handle_kv && group_leader) {
if (group_leader) {
// Trigger the stores to global memory.
if (Dh == Dh_MAX || co < Dh / QK_ELTS_IN_16B) {
if (!params.k_cache_per_sample) {
// Two chunks are separated by L * x elements. A thread write QK_VEC_SIZE elements.
int offset = bhi * params.memory_max_len * Dh + co * params.memory_max_len * QK_ELTS_IN_16B
+ tlength_circ * QK_ELTS_IN_16B + ci;
if (!QUANT_POLICY) {
*reinterpret_cast<Qk_vec_m*>(&params.k_cache[offset]) = vec_conversion<Qk_vec_m, Qk_vec_k>(k);
}
else if (QUANT_POLICY == 4) {
using Packed_Int8_t = typename packed_type<int8_t, num_elems<Qk_vec_k>::value>::type;
Packed_Int8_t k_int8 = quant(k, k_scale, k_zp);
int offset = params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh + tlength_circ * Dh
+ co * QK_ELTS_IN_16B + ci;
int8_t* dst_ptr = reinterpret_cast<int8_t*>(params.k_cache);
*reinterpret_cast<Packed_Int8_t*>(&dst_ptr[offset]) = k_int8;
}
if (!QUANT_POLICY) {
*reinterpret_cast<Qk_vec_m*>(&params.k_cache_per_sample[bi][offset]) =
vec_conversion<Qk_vec_m, Qk_vec_k>(k);
}
else {
int offset;
if (params.k_cache_interleaved) {
offset = params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh
+ co * params.memory_max_len * QK_ELTS_IN_16B + tlength_circ * QK_ELTS_IN_16B + ci;
}
else {
offset = params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh
+ tlength_circ * Dh + co * QK_ELTS_IN_16B + ci;
}
else if (QUANT_POLICY == 4) {
using Packed_Int8_t = typename packed_type<int8_t, num_elems<Qk_vec_k>::value>::type;
Packed_Int8_t k_int8 = quant(k, k_scale, k_zp);
if (!QUANT_POLICY) {
*reinterpret_cast<Qk_vec_m*>(&params.k_cache_per_sample[bi][offset]) =
vec_conversion<Qk_vec_m, Qk_vec_k>(k);
}
else if (QUANT_POLICY == 4) {
using Packed_Int8_t = typename packed_type<int8_t, num_elems<Qk_vec_k>::value>::type;
Packed_Int8_t k_int8 = quant(k, k_scale, k_zp);
int8_t* dst_ptr = reinterpret_cast<int8_t*>(params.k_cache_per_sample[bi]);
*reinterpret_cast<Packed_Int8_t*>(&dst_ptr[offset]) = k_int8;
}
int8_t* dst_ptr = reinterpret_cast<int8_t*>(params.k_cache_per_sample[bi]);
*reinterpret_cast<Packed_Int8_t*>(&dst_ptr[offset]) = k_int8;
}
}
}
......@@ -1584,33 +1515,21 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
int8_t* k_cache_batch_int8 = nullptr;
if (!QUANT_POLICY) {
k_cache_batch = params.k_cache_per_sample ? (params.k_cache_per_sample[bi] + params.kv_cache_per_sample_offset
+ kvhi * params.memory_max_len * Dh + ki) :
&params.k_cache[bhi * params.memory_max_len * Dh + ki];
// Base pointer for the beam's batch, before offsetting with indirection buffer
// T* k_cache_batch = &params.k_cache[bbhi * params.memory_max_len * Dh + ki];
k_cache_batch =
params.k_cache_per_sample[bi] + params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh + ki;
}
else if (QUANT_POLICY == 4) {
// convert k_cache_per_sample to int8
if (params.k_cache_per_sample) {
int8_t* ptr = reinterpret_cast<int8_t*>(params.k_cache_per_sample[bi]);
k_cache_batch_int8 = ptr + params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh + ki;
}
else {
int8_t* ptr = reinterpret_cast<int8_t*>(params.k_cache);
k_cache_batch_int8 = &ptr[bhi * params.memory_max_len * Dh + ki];
}
int8_t* ptr = reinterpret_cast<int8_t*>(params.k_cache_per_sample[bi]);
k_cache_batch_int8 = ptr + params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh + ki;
}
// Pick a number of keys to make sure all the threads of a warp enter (due to shfl_sync).
// int ti_end = div_up(params.timestep, K_PER_WARP) * K_PER_WARP;
int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step;
// prefix prompt length, if any
const int prefix_prompt_length = (params.prefix_prompt_lengths == nullptr) ? 0 : params.prefix_prompt_lengths[bi];
// Iterate over the keys/timesteps to compute the various (Q*K^T)_{ti} values.
const int* beam_indices = HAS_BEAMS ? &params.cache_indir[bi_seq_len_offset] : nullptr;
// const int* beam_indices = HAS_BEAMS ? &params.cache_indir[bi_seq_len_offset] : nullptr;
for (int ti = first_step + ko; ti < ti_end; ti += K_PER_ITER) {
const int ti_circ = ti % params.memory_max_len;
......@@ -1622,8 +1541,7 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
zero(k_vec_zero);
#pragma unroll
for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) {
int jj =
params.k_cache_interleaved ? ii * params.memory_max_len + ti_circ : ti_circ * Dh / QK_ELTS_IN_16B + ii;
int jj = ti_circ * Dh / QK_ELTS_IN_16B + ii;
// if( ti < params.timestep ) {
const bool within_bounds = (Dh == Dh_MAX || jj * QK_ELTS_IN_16B < Dh * params.memory_max_len);
if (ti < tlength) {
......@@ -1632,9 +1550,6 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
}
else {
int beam_offset = 0;
if (HAS_BEAMS) {
beam_offset = beam_indices[ti_circ] * params.num_heads * params.memory_max_len * Dh;
}
if (!QUANT_POLICY) {
k[ii] = vec_conversion<K_vec_k, K_vec_m>(
......@@ -1771,24 +1686,15 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
int8_t* v_cache_batch_int8 = nullptr;
if (!QUANT_POLICY) {
v_cache = params.v_cache_per_sample ? (params.v_cache_per_sample[bi] + params.kv_cache_per_sample_offset
+ kvhi * params.memory_max_len * Dh + vi) :
&params.v_cache[bhi * params.memory_max_len * Dh + vi];
v_cache =
params.v_cache_per_sample[bi] + params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh + vi;
// Base pointer for the beam's batch, before offsetting with indirection buffer
// T* v_cache_batch = &params.v_cache[bbhi * params.memory_max_len * Dh + vi];
v_cache_batch = v_cache;
}
else if (QUANT_POLICY == 4) {
if (params.v_cache_per_sample) {
int8_t* ptr = reinterpret_cast<int8_t*>(params.v_cache_per_sample[bi]);
v_cache_int8 = ptr + params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh + vi;
}
else {
int8_t* ptr = reinterpret_cast<int8_t*>(params.v_cache);
v_cache_int8 = &ptr[bhi * params.memory_max_len * Dh + vi];
}
int8_t* ptr = reinterpret_cast<int8_t*>(params.v_cache_per_sample[bi]);
v_cache_int8 = ptr + params.kv_cache_per_sample_offset + kvhi * params.memory_max_len * Dh + vi;
v_cache_batch_int8 = v_cache_int8;
}
......@@ -1819,19 +1725,14 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
// the compiler cannot optimize the codes automatically.
const int min_length = min(tlength, params.memory_max_len);
for (int ti = first_step + vo; ti < min_length; ti += V_PER_ITER) {
// Fetch offset based on cache_indir when beam sampling
const int beam_src = HAS_BEAMS ? params.cache_indir[bi_seq_len_offset + ti] : 0;
const int beam_offset = HAS_BEAMS ? beam_src * params.num_heads * params.memory_max_len * Dh : 0;
// Load the values from the cache.
V_vec_k v;
if (!QUANT_POLICY) {
v = vec_conversion<V_vec_k, V_vec_m>(
*reinterpret_cast<const V_vec_m*>(&v_cache_batch[beam_offset + ti * Dh]));
v = vec_conversion<V_vec_k, V_vec_m>(*reinterpret_cast<const V_vec_m*>(&v_cache_batch[ti * Dh]));
}
else if (QUANT_POLICY == 4) {
Packed_Int8_t v_vec_m_int8 =
*reinterpret_cast<const Packed_Int8_t*>(&v_cache_batch_int8[beam_offset + ti * Dh]);
Packed_Int8_t v_vec_m_int8 = *reinterpret_cast<const Packed_Int8_t*>(&v_cache_batch_int8[ti * Dh]);
Packed_Float_t v_vec_m_float = dequant(v_vec_m_int8, v_scale, v_zp);
v = vec_conversion<V_vec_k, Packed_Float_t>(v_vec_m_float);
......@@ -1867,18 +1768,13 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
}
const int ti_circ = ti % params.memory_max_len;
// Fetch offset based on cache_indir when beam sampling
const int beam_src = HAS_BEAMS ? params.cache_indir[bi_seq_len_offset + ti_circ] : 0;
const int beam_offset = HAS_BEAMS ? beam_src * params.num_heads * params.memory_max_len * Dh : 0;
// Load the values from the cache.
V_vec_k v;
if (!QUANT_POLICY) {
v = vec_conversion<V_vec_k, V_vec_m>(
*reinterpret_cast<const V_vec_m*>(&v_cache_batch[beam_offset + ti_circ * Dh]));
v = vec_conversion<V_vec_k, V_vec_m>(*reinterpret_cast<const V_vec_m*>(&v_cache_batch[ti_circ * Dh]));
}
else if (QUANT_POLICY == 4) {
Packed_Int8_t v_vec_m_int8 =
*reinterpret_cast<const Packed_Int8_t*>(&v_cache_batch_int8[beam_offset + ti_circ * Dh]);
Packed_Int8_t v_vec_m_int8 = *reinterpret_cast<const Packed_Int8_t*>(&v_cache_batch_int8[ti_circ * Dh]);
Packed_Float_t v_vec_m_float = dequant(v_vec_m_int8, v_scale, v_zp);
v = vec_conversion<V_vec_k, Packed_Float_t>(v_vec_m_float);
......@@ -1927,11 +1823,9 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
}
// Store the V values to cache
if (handle_kv && group_leader) {
if (group_leader) {
// Store the values with bias back to global memory in the cache for V.
//*reinterpret_cast<V_vec_k*>(&v_cache[params.timestep*Dh]) = v;
if (!QUANT_POLICY) {
*reinterpret_cast<V_vec_m*>(&v_cache[tlength_circ * Dh]) = vec_conversion<V_vec_m, V_vec_k>(v);
}
......
......@@ -19,25 +19,6 @@
namespace turbomind {
template<typename T>
void invokeAddQKVBiasIA3Transpose(T* q_buf,
T* k_buf,
T* v_buf,
T* Q,
const T* bias_Q,
T* K,
const T* bias_K,
T* V,
const T* bias_V,
const int batch_size,
const int seq_len,
const int head_num,
const int size_per_head,
const int* ia3_tasks,
const T* ia3_key_weights,
const T* ia3_value_weights,
cudaStream_t stream);
template<typename T, typename T_IN>
struct MaskedSoftmaxParam {
// Common parameters.
......@@ -69,27 +50,6 @@ void invokeTransposeQKV(T* dst,
const int int8_mode,
cudaStream_t stream);
template<typename T>
void invokeAddQKVBiasIA3RebuildPadding(T* Q,
const T* bias_Q,
T* K,
const T* bias_K,
T* V,
const T* bias_V,
T* q_buf,
T* k_buf,
T* v_buf,
const int batch_size,
const int seq_len,
const int head_num,
const int size_per_head,
const int valid_word_num,
const int* mask_offset,
const int* ia3_tasks,
const T* ia3_key_weights,
const T* ia3_value_weights,
cudaStream_t stream);
template<typename T>
void invokeTransposeAttentionOutRemovePadding(T* src,
T* dst,
......@@ -103,36 +63,26 @@ void invokeTransposeAttentionOutRemovePadding(T* src,
const int int8_mode,
cudaStream_t stream);
// Prefix Prompt Parameters
template<typename T>
struct PrefixPromptBatchWeightsParam {
const T** d_prefix_prompt_batch = nullptr;
const int* d_prefix_prompt_lengths = nullptr;
const int max_prefix_prompt_length = 0;
// l * 2 * hidden_units_ / tensor_para_.world_size_
const size_t prefix_prompt_layer_offset_per_seq = 0;
};
template<typename T>
void invokeAddFusedQKVBiasTranspose(T* q_buf,
T* k_buf,
T* v_buf,
PrefixPromptBatchWeightsParam<T> param,
T* QKV,
const T* qkv_bias,
const int* padding_offset,
const int* history_length,
const int batch_size,
const int seq_len,
const int token_num,
const int head_num,
const int kv_head_num,
const int size_per_head,
const int rotary_embedding_dim,
const int neox_rotary_style,
const float* scale,
const int int8_mode,
cudaStream_t stream);
void invokeAddFusedQKVBiasTranspose(T* q_buf,
T* k_buf,
T* v_buf,
T* QKV,
const T* qkv_bias,
const int* padding_offset,
const int* history_length,
const int* input_length,
const int batch_size,
const int seq_len,
const int token_num,
const int head_num,
const int kv_head_num,
const int size_per_head,
const int rotary_embedding_dim,
int max_position_embeddings,
bool use_dynamic_ntk,
bool use_logn_attn,
cudaStream_t stream);
template<typename T>
void invokeTranspose4d(T* dst,
......
......@@ -163,21 +163,21 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
invokeAddFusedQKVBiasTranspose(q_buf_2_,
k_buf_2_,
v_buf_2_,
PrefixPromptBatchWeightsParam<T>{},
qkv_buf_,
weights->qkv.bias,
padding_offset, // padding_offset,
history_length, // used for applying rotary embedding
input_length,
batch_size,
max_q_len, // seq_len
num_token, // batch_size * seq_len
local_head_num_,
local_kv_head_num_,
size_per_head_,
rotary_embedding_dim_,
neox_rotary_style_,
nullptr, // query_weight.scale_out
0, // int8 mode
params_.rotray_embedding_dim,
params_.max_position_embeddings,
params_.use_dynamic_ntk,
params_.use_logn_attn,
stream_);
sync_check_cuda_error();
......
......@@ -23,6 +23,7 @@
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/nccl_utils.h"
......@@ -34,26 +35,24 @@ public:
void freeBuffer();
void allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len);
LlamaContextAttentionLayer(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t rotary_embedding_dim,
bool neox_rotary_style,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int quant_policy):
LlamaContextAttentionLayer(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
LlamaAttentionParams attn_params,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int quant_policy):
head_num_(head_num),
size_per_head_(size_per_head),
hidden_units_(head_num * size_per_head),
local_head_num_(head_num / tensor_para.world_size_),
local_kv_head_num_(kv_head_num / tensor_para.world_size_),
head_n_rep_(head_num / kv_head_num),
rotary_embedding_dim_(rotary_embedding_dim),
neox_rotary_style_(neox_rotary_style),
params_(attn_params),
tensor_para_(tensor_para),
stream_(stream),
cublas_wrapper_(cublas_wrapper),
......@@ -99,10 +98,9 @@ private:
const size_t local_kv_head_num_;
const size_t local_head_num_;
const size_t head_n_rep_;
const size_t rotary_embedding_dim_;
const bool is_free_buffer_after_forward_;
const bool neox_rotary_style_;
const LlamaAttentionParams params_;
const bool use_fmha_;
const int quant_policy_;
......
......@@ -61,15 +61,17 @@ void LlamaContextDecoder<T>::freeBuffer()
}
template<typename T>
void LlamaContextDecoder<T>::initialize(size_t kv_head_num, bool use_fmha, int quant_policy)
void LlamaContextDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_fmha,
int quant_policy)
{
h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true);
context_attention_layer_ = new LlamaContextAttentionLayer<T>(head_num_,
kv_head_num,
size_per_head_,
rotary_embedding_dim_,
false, // neox_rotary_style
attn_params,
tensor_para_,
stream_,
cublas_wrapper_,
......@@ -124,32 +126,31 @@ void LlamaContextDecoder<T>::forwardSelfAttn(const Session&
}
template<typename T>
LlamaContextDecoder<T>::LlamaContextDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t rotary_embedding_dim,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int quant_policy):
LlamaContextDecoder<T>::LlamaContextDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int quant_policy):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
hidden_units_(head_num * size_per_head),
num_layer_(num_layer),
rotary_embedding_dim_(rotary_embedding_dim),
rmsnorm_eps_(rmsnorm_eps),
tensor_para_(tensor_para),
data_type_(getTensorType<T>())
{
initialize(kv_head_num, use_fmha, quant_policy);
initialize(attn_params, kv_head_num, use_fmha, quant_policy);
}
template<typename T>
......
......@@ -20,14 +20,11 @@
#pragma once
// #include "src/turbomind/kernels/add_residual_kernels.h"
// #include "src/turbomind/kernels/layernorm_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
// #include "src/turbomind/layers/FfnLayer.h"
// #include "src/turbomind/layers/attention_layers/BaseAttentionLayer.h"
#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
......@@ -43,13 +40,12 @@ protected:
void allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len);
void freeBuffer() override;
void initialize(size_t kv_head_num, bool use_fmha, int quant_policy);
void initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, bool use_fmha, int quant_policy);
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t rotary_embedding_dim_;
size_t hidden_units_;
float rmsnorm_eps_;
......@@ -87,20 +83,20 @@ protected:
bool is_final);
public:
LlamaContextDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t rotary_embedding_dim,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int quant_policy);
LlamaContextDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int quant_policy);
~LlamaContextDecoder() override;
......
......@@ -23,37 +23,37 @@
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/models/llama/llama_utils.h"
namespace turbomind {
template<typename T>
LlamaDecoder<T>::LlamaDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t rotary_embedding_dim,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int quant_policy):
LlamaDecoder<T>::LlamaDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int quant_policy):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
num_layer_(num_layer),
rotary_embedding_dim_(rotary_embedding_dim),
hidden_units_(head_num * size_per_head),
rmsnorm_eps_(rmsnorm_eps),
tensor_para_(tensor_para),
data_type_(getTensorType<T>())
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize(kv_head_num, quant_policy);
initialize(attn_params, kv_head_num, quant_policy);
}
template<typename T>
......@@ -65,15 +65,14 @@ LlamaDecoder<T>::~LlamaDecoder()
}
template<typename T>
void LlamaDecoder<T>::initialize(size_t kv_head_num, int quant_policy)
void LlamaDecoder<T>::initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, int quant_policy)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
self_attention_layer_ = new LlamaDecoderSelfAttentionLayer<T>(head_num_,
kv_head_num,
size_per_head_,
rotary_embedding_dim_,
false, // neox_rotary_style
attn_params,
tensor_para_,
stream_,
cublas_wrapper_,
......
......@@ -20,10 +20,10 @@
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.h
#include "src/turbomind/layers/BaseLayer.h"
// #include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/nccl_utils.h"
......@@ -35,13 +35,12 @@ protected:
void allocateBuffer() override; // deprecated
void allocateBuffer(size_t batch_size);
void freeBuffer() override;
void initialize(size_t kv_head_num, int quant_policy);
void initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, int quant_policy);
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t rotary_embedding_dim_;
size_t hidden_units_;
float rmsnorm_eps_;
......@@ -69,19 +68,19 @@ protected:
void forwardFfn(const LlamaDecoder::Session& sess, T* ffn_io, size_t layer);
public:
LlamaDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t rotary_embedding_dim,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int quant_policy);
LlamaDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int quant_policy);
~LlamaDecoder() override;
......
......@@ -61,6 +61,9 @@ static inline void fusedQKV_masked_attention_dispatch(const T* qkv_buf,
const int kv_head_num,
const int size_per_head,
const int rotary_embedding_dim,
const int max_position_embeddings,
const bool use_dynamic_ntk,
const bool use_logn_attn,
const int memory_max_len,
const int* prefix_prompt_lengths,
const int max_prefix_prompt_length,
......@@ -110,13 +113,9 @@ static inline void fusedQKV_masked_attention_dispatch(const T* qkv_buf,
FT_CHECK(k_cache_per_sample && v_cache_per_sample);
params.k_cache = reinterpret_cast<DataType*>(key_cache);
params.v_cache = reinterpret_cast<DataType*>(value_cache);
params.k_cache_per_sample = reinterpret_cast<DataType**>(k_cache_per_sample);
params.v_cache_per_sample = reinterpret_cast<DataType**>(v_cache_per_sample);
params.kv_cache_per_sample_offset = kv_cache_per_sample_offset;
params.k_cache_interleaved = false;
params.cache_indir = cache_indir;
params.batch_size = inference_batch_size;
params.beam_width = beam_width;
params.memory_max_len = memory_max_len;
......@@ -128,8 +127,12 @@ static inline void fusedQKV_masked_attention_dispatch(const T* qkv_buf,
params.num_heads = head_num;
params.num_kv_heads = kv_head_num;
params.hidden_size_per_head = size_per_head;
params.rotary_embedding_dim = rotary_embedding_dim;
params.hidden_size_per_head = size_per_head;
params.rotary_embedding_dim = rotary_embedding_dim;
params.max_position_embeddings = max_position_embeddings;
params.use_dynamic_ntk = use_dynamic_ntk;
params.use_logn_attn = use_logn_attn;
// Note: keep norm factor (sqrt(K_dim)) when adopting megatron T5 structure (may adjust)
params.inv_sqrt_dh = 1.F / (sqrtf((float)params.hidden_size_per_head) * q_scaling);
......@@ -146,10 +149,6 @@ static inline void fusedQKV_masked_attention_dispatch(const T* qkv_buf,
}
params.max_input_length = max_input_len;
params.ia3_tasks = ia3_tasks;
params.ia3_key_weights = reinterpret_cast<const DataType*>(ia3_key_weights);
params.ia3_value_weights = reinterpret_cast<const DataType*>(ia3_value_weights);
params.int8_mode = int8_mode;
if (int8_mode & QuantPolicy::kCacheKVInt8) {
......@@ -185,8 +184,6 @@ void LlamaDecoderSelfAttentionLayer<T>::freeBuffer()
if (is_allocate_buffer_) {
allocator_->free((void**)(&qkv_buf_));
allocator_->free((void**)(&context_buf_));
// allocator_->free((void**)(&k_cache_buf_));
// allocator_->free((void**)(&v_cache_buf_));
is_allocate_buffer_ = false;
}
}
......@@ -263,7 +260,10 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
local_head_num_,
local_kv_head_num_,
size_per_head_,
rotary_embedding_dim_,
params_.rotray_embedding_dim,
params_.max_position_embeddings,
params_.use_dynamic_ntk,
params_.use_logn_attn,
memory_len,
nullptr, // prefix_prompt_lengths
0, // max_prefix_prompt_length
......