Unverified Commit 233bbb8c authored by UnicornChan, committed by GitHub

Merge pull request #57 from UnicornChan/develop-0.1.3

[feature] release 0.1.3
parents 67f8b370 4d1d561d
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : Jianwei Dong
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
import torch
layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
layer_step: int = 1
token_step: int = 1
layer_offset: int = 0
max_thread_num: int = 64
max_batch_size: int = 1
max_block_num: int = 1024
CPUInfer = cpuinfer_ext.CPUInfer(max_thread_num)
warm_up_iter = 1000
test_iter = 10000
def bench_linear(cache_seqlen: int):
with torch.inference_mode(mode=True):
cache_seqlens = torch.tensor([cache_seqlen], dtype=torch.int32, device="cpu")
seqlens_zero = torch.zeros((1,), dtype=torch.int32, device="cpu")
config = cpuinfer_ext.kvcache.KVCacheConfig(
layer_num,
kv_head_num,
q_head_num,
head_dim,
block_len,
anchor_num,
anchor_type,
kv_type,
retrieval_type,
layer_step,
token_step,
layer_offset,
max_block_num,
max_batch_size,
max_thread_num,
)
local_kvcache = cpuinfer_ext.kvcache.KVCache(config)
block_table = (
torch.arange(max_block_num, dtype=torch.int32, device="cpu")
.contiguous()
.view(1, -1)
)
for layer_idx in range(layer_num):
k_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim),
dtype=torch.float16,
device="cpu",
).contiguous()
v_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim),
dtype=torch.float16,
device="cpu",
).contiguous()
CPUInfer.submit(
local_kvcache.update_kvcache_fp16(
k_cache.data_ptr(),
v_cache.data_ptr(),
layer_idx,
block_table.data_ptr(),
1,
max_block_num,
seqlens_zero.data_ptr(),
cache_seqlen,
)
)
CPUInfer.sync()
input = torch.randn(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
output = torch.empty(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
# attn_lse: (bsz, q_len, q_head_num)
attn_lse = torch.empty(
(1, 1, q_head_num), dtype=torch.float32, device="cpu"
).contiguous()
input = input / 100
# warm up
for i in range(warm_up_iter):
CPUInfer.submit(
local_kvcache.attn(
input.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
i % layer_num,
0,
1,
1,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
-1,
-1,
-1,
)
)
CPUInfer.sync()
# test
start = time.perf_counter()
for i in range(test_iter):
CPUInfer.submit(
local_kvcache.attn(
input.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
i % layer_num,
0,
1,
1,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
-1,
-1,
-1,
)
)
CPUInfer.sync()
end = time.perf_counter()
total_time = end - start
print("cache sequence length: ", cache_seqlen)
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
cache_seqlen
* kv_head_num
* head_dim
* 2
* 2
* test_iter
/ total_time
/ 1000
/ 1000
/ 1000,
"GB/s",
)
print("")
bench_linear(1024)
bench_linear(4096)
bench_linear(16384)
bench_linear(32768)
bench_linear(65536)
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : Jianwei Dong
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
import torch
layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
warm_up_iter = 1000
test_iter = 10000
def bench_linear(cache_seqlen: int, device):
with torch.inference_mode(mode=True):
kvcaches = []
for layer_idx in range(layer_num):
k_cache = torch.randn(
(1, 32, cache_seqlen, head_dim),
dtype=torch.float16,
device=device,
).contiguous()
v_cache = torch.randn(
(1, 32, cache_seqlen, head_dim),
dtype=torch.float16,
device=device,
).contiguous()
kvcaches.append((k_cache, v_cache))
input = torch.randn(
(1, q_head_num, 1, head_dim), dtype=torch.float16, device=device
).contiguous()
input = input / 100
# warm up
for i in range(warm_up_iter):
k_cache = kvcaches[i % layer_num][0]
v_cache = kvcaches[i % layer_num][1]
torch.nn.functional.scaled_dot_product_attention(input, k_cache, v_cache)
# test
start = time.perf_counter()
for i in range(test_iter):
k_cache = kvcaches[i % layer_num][0]
v_cache = kvcaches[i % layer_num][1]
torch.nn.functional.scaled_dot_product_attention(input, k_cache, v_cache)
end = time.perf_counter()
total_time = end - start
print("cache sequence length: ", cache_seqlen)
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
cache_seqlen
* q_head_num
* head_dim
* 2
* 2
* test_iter
/ total_time
/ 1000
/ 1000
/ 1000,
"GB/s",
)
print("")
bench_linear(1024, "cpu")
bench_linear(4096, "cpu")
bench_linear(1024, "cuda")
bench_linear(4096, "cuda")
bench_linear(16384, "cuda")
bench_linear(32768, "cuda")
bench_linear(65536, "cuda")
......@@ -3,93 +3,125 @@
* @Author : chenht2022
* @Date : 2024-07-22 02:03:05
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:34
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "backend.h"
Backend::Backend(int thread_num) {
thread_num_ = thread_num;
thread_state_.resize(thread_num);
for (int i = 0; i < thread_num; i++) {
thread_local int Backend::thread_local_id = -1;
Backend::Backend(int max_thread_num) {
max_thread_num_ = max_thread_num;
thread_state_.resize(max_thread_num_);
for (int i = 0; i < max_thread_num_; i++) {
thread_state_[i].curr = std::make_unique<std::atomic<int>>();
thread_state_[i].status = std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
thread_state_[i].status =
std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
}
workers_.resize(thread_num);
for (int i = 1; i < thread_num; i++) {
workers_.resize(max_thread_num_);
for (int i = 1; i < max_thread_num_; i++) {
workers_[i] = std::thread(&Backend::worker_thread, this, i);
}
}
Backend::~Backend() {
for (int i = 0; i < thread_num_; i++) {
thread_state_[i].status->store(ThreadStatus::EXIT, std::memory_order_release);
for (int i = 0; i < max_thread_num_; i++) {
thread_state_[i].status->store(ThreadStatus::EXIT,
std::memory_order_release);
}
for (int i = 1; i < thread_num_; i++) {
for (int i = 1; i < max_thread_num_; i++) {
if (workers_[i].joinable()) {
workers_[i].join();
}
}
}
int Backend::get_thread_num() {
return thread_num_;
}
int Backend::get_thread_num() { return max_thread_num_; }
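// do_work_stealing_job splits task_num tasks across at most max_thread_num_ threads.
// For each participating thread: init_func (if non-null) runs once before any task,
// compute_func runs once per task id (stealing from other threads' ranges once the
// local range is exhausted), and finalize_func (if non-null) runs once afterwards.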
void Backend::do_work_stealing_job(int task_num, std::function<void(int)> func) {
func_ = func;
void Backend::do_work_stealing_job(int task_num,
std::function<void(int)> init_func,
std::function<void(int)> compute_func,
std::function<void(int)> finalize_func) {
init_func_ = init_func;
compute_func_ = compute_func;
finalize_func_ = finalize_func;
thread_num_ = std::min(max_thread_num_, task_num);
int base = task_num / thread_num_;
int remain = task_num % thread_num_;
thread_state_[0].end = base + (0 < remain);
    // set thread_local_id for the main thread
thread_local_id = 0;
for (int i = 1; i < thread_num_; i++) {
thread_state_[i].curr->store(thread_state_[i - 1].end, std::memory_order_relaxed);
thread_state_[i].curr->store(thread_state_[i - 1].end,
std::memory_order_relaxed);
thread_state_[i].end = thread_state_[i - 1].end + base + (i < remain);
thread_state_[i].status->store(ThreadStatus::WORKING, std::memory_order_release);
thread_state_[i].status->store(ThreadStatus::WORKING,
std::memory_order_release);
}
thread_state_[0].curr->store(0, std::memory_order_relaxed);
thread_state_[0].status->store(ThreadStatus::WORKING, std::memory_order_release);
thread_state_[0].status->store(ThreadStatus::WORKING,
std::memory_order_release);
process_tasks(0);
for (int i = 1; i < thread_num_; i++) {
while (thread_state_[i].status->load(std::memory_order_acquire) == ThreadStatus::WORKING) {
while (thread_state_[i].status->load(std::memory_order_acquire) ==
ThreadStatus::WORKING) {
}
}
}
void Backend::process_tasks(int thread_id) {
if (init_func_ != nullptr) {
init_func_(thread_id);
}
while (true) {
int task_id = thread_state_[thread_id].curr->fetch_add(1, std::memory_order_acq_rel);
int task_id = thread_state_[thread_id].curr->fetch_add(
1, std::memory_order_acq_rel);
if (task_id >= thread_state_[thread_id].end) {
break;
}
func_(task_id);
compute_func_(task_id);
}
for (int t_offset = 1; t_offset < thread_num_; t_offset++) {
int t_i = (thread_id + t_offset) % thread_num_;
if (thread_state_[t_i].status->load(std::memory_order_acquire) != ThreadStatus::WORKING) {
if (thread_state_[t_i].status->load(std::memory_order_acquire) !=
ThreadStatus::WORKING) {
continue;
}
while (true) {
int task_id = thread_state_[t_i].curr->fetch_add(1, std::memory_order_acq_rel);
int task_id = thread_state_[t_i].curr->fetch_add(
1, std::memory_order_acq_rel);
if (task_id >= thread_state_[t_i].end) {
break;
}
func_(task_id);
compute_func_(task_id);
}
}
thread_state_[thread_id].status->store(ThreadStatus::WAITING, std::memory_order_release);
if (finalize_func_ != nullptr) {
finalize_func_(thread_id);
}
thread_state_[thread_id].status->store(ThreadStatus::WAITING,
std::memory_order_release);
}
void Backend::worker_thread(int thread_id) {
auto start = std::chrono::steady_clock::now();
    thread_local_id = thread_id; // set the thread-local id for this worker thread
while (true) {
ThreadStatus status = thread_state_[thread_id].status->load(std::memory_order_acquire);
ThreadStatus status =
thread_state_[thread_id].status->load(std::memory_order_acquire);
if (status == ThreadStatus::WORKING) {
process_tasks(thread_id);
start = std::chrono::steady_clock::now();
} else if (status == ThreadStatus::WAITING) {
auto now = std::chrono::steady_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(now - start).count();
auto duration =
std::chrono::duration_cast<std::chrono::milliseconds>(now -
start)
.count();
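            // Back off once this worker has been idle for more than 50 ms; spin otherwise.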
if (duration > 50) {
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
......
......@@ -3,7 +3,7 @@
* @Author : chenht2022
* @Date : 2024-07-22 02:03:05
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:38
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
......@@ -31,20 +31,25 @@ struct ThreadState {
};
class Backend {
public:
public:
Backend(int);
~Backend();
int get_thread_num();
void do_work_stealing_job(int, std::function<void(int)>);
void do_work_stealing_job(int, std::function<void(int)>,
std::function<void(int)>,
std::function<void(int)>);
static thread_local int thread_local_id;
private:
private:
int thread_num_;
std::vector<ThreadState> thread_state_; // [thread_num]
std::function<void(int)> func_;
int max_thread_num_;
std::vector<ThreadState> thread_state_; // [thread_num]
std::function<void(int)> init_func_;
std::function<void(int)> compute_func_;
std::function<void(int)> finalize_func_;
std::vector<std::thread> workers_;
void process_tasks(int);
void worker_thread(int);
};
#endif
\ No newline at end of file
......@@ -54,4 +54,4 @@ void TaskQueue::processTasks() {
}
mutex.unlock();
}
}
}
\ No newline at end of file
......@@ -4,7 +4,7 @@
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenxl
* @LastEditTime : 2024-08-12 12:28:25
* @LastEditTime : 2024-08-08 04:23:51
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_TASKQUEUE_H
......
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
from flash_attn import flash_attn_with_kvcache
import torch
layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
cache_seqlen = 8192
cache_seqlens = torch.tensor([cache_seqlen], dtype=torch.int32, device="cpu")
seqlens_zero = torch.zeros((1,), dtype=torch.int32, device="cpu")
anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
layer_step: int = 1
token_step: int = 1
layer_offset: int = 0
max_thread_num: int = 2
max_batch_size: int = 1
max_block_num: int = 512
CPUInfer = cpuinfer_ext.CPUInfer(max_thread_num)
validation_iter = 100
with torch.inference_mode(mode=True):
config = cpuinfer_ext.kvcache.KVCacheConfig(
layer_num,
kv_head_num,
q_head_num,
head_dim,
block_len,
anchor_num,
anchor_type,
kv_type,
retrieval_type,
layer_step,
token_step,
layer_offset,
max_block_num,
max_batch_size,
max_thread_num,
)
local_kvcache = cpuinfer_ext.kvcache.KVCache(config)
kvcaches = []
block_table = (
torch.arange(max_block_num, dtype=torch.int32, device="cpu")
.contiguous()
.view(1, -1)
)
for layer_idx in range(layer_num):
k_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
v_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
CPUInfer.submit(
local_kvcache.update_kvcache_fp16(
k_cache.data_ptr(),
v_cache.data_ptr(),
layer_idx,
block_table.data_ptr(),
1,
max_block_num,
seqlens_zero.data_ptr(),
cache_seqlen,
)
)
CPUInfer.sync()
kvcaches.append((k_cache.to("cuda"), v_cache.to("cuda")))
# validation
for i in range(validation_iter):
k_cache = kvcaches[i % layer_num][0]
v_cache = kvcaches[i % layer_num][1]
input = torch.randn(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
output = torch.empty(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
# attn_lse: (bsz, q_len, q_head_num)
attn_lse = torch.empty(
(1, 1, q_head_num), dtype=torch.float32, device="cpu"
).contiguous()
input = input / 100
CPUInfer.submit(
local_kvcache.attn(
input.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
i % layer_num,
0,
1,
1,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
-1,
-1,
-1,
)
)
CPUInfer.sync()
# print("cpuinfer output", output)
t_output = flash_attn_with_kvcache(
q=input.to("cuda"),
k_cache=k_cache,
v_cache=v_cache,
cache_seqlens=cache_seqlens.to("cuda"),
)
# print("torch output", t_output)
diff = torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean(
torch.abs(t_output)
)
print("diff = ", diff)
assert diff < 0.001
This diff is collapsed.
/**
* @Description :
* @Author : Jianwei Dong
* @Date : 2024-08-26 22:47:06
* @Version : 1.0.0
* @LastEditors : Jianwei Dong
* @LastEditTime : 2024-08-26 22:47:06
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "kvcache.h"
void KVCache::load_kvcache(std::string tensor_file_path, Backend *backend) {
// Timer start
auto start = std::chrono::high_resolution_clock::now();
std::ifstream ifs_tensor(tensor_file_path, std::ios::binary);
if (!ifs_tensor) {
throw std::runtime_error("Failed to open tensor file");
}
ifs_tensor.read(reinterpret_cast<char *>(&cache_total_len_),
sizeof(cache_total_len_));
int past_block_num =
(cache_total_len_ + config_.block_len - 1) / config_.block_len;
printf("cache_total_len: %d, past_block_num: %d\n", cache_total_len_,
past_block_num);
for (int i = 0; i < config_.layer_num; ++i) {
past_block_num_[i] = past_block_num;
}
ifs_tensor.read(reinterpret_cast<char *>(anchor_.data()),
anchor_.size() * sizeof(ggml_fp16_t));
for (int i = 0; i < config_.layer_num; ++i) {
for (int j = 0; j < config_.kv_head_num; ++j) {
for (int k = 0; k < past_block_num_[i]; ++k) {
if (config_.kv_type == GGML_TYPE_F16) {
ifs_tensor.read(
reinterpret_cast<char *>(k_cache_fp16_[i][j][k].data()),
k_cache_fp16_[i][j][k].size() * sizeof(ggml_fp16_t));
ifs_tensor.read(
reinterpret_cast<char *>(v_cache_fp16_[i][j][k].data()),
v_cache_fp16_[i][j][k].size() * sizeof(ggml_fp16_t));
} else if (config_.kv_type == GGML_TYPE_Q4_0) {
ifs_tensor.read(
reinterpret_cast<char *>(k_cache_q4[i][j][k].data()),
k_cache_q4[i][j][k].size() * sizeof(block_q4_0));
ifs_tensor.read(
reinterpret_cast<char *>(v_cache_q4[i][j][k].data()),
v_cache_q4[i][j][k].size() * sizeof(block_q4_0));
}
}
}
for (int k = 0; k < past_block_num_[i]; ++k) {
for (int l = 0; l < config_.block_len; l++) {
ifs_tensor.read(
reinterpret_cast<char *>(importance_[i][k][l].data()),
importance_[i][k][l].size() * sizeof(ggml_fp16_t));
}
}
}
ifs_tensor.close();
// Timer end
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
printf("time of load: %f s\n", diff.count());
}
void KVCache::dump_kvcache(int *block_table, int cache_total_len,
std::string tensor_file_path, Backend *backend) {
// Timer start
auto start = std::chrono::high_resolution_clock::now();
std::ofstream ofs(tensor_file_path, std::ios::binary);
printf("dump_kvcache: %s\n", tensor_file_path.c_str());
if (!ofs.is_open()) {
std::cerr << "Cannot open file " << tensor_file_path << std::endl;
return;
}
ofs.write(reinterpret_cast<const char *>(&cache_total_len),
sizeof(cache_total_len));
int past_block_num =
(cache_total_len + config_.block_len - 1) / config_.block_len;
printf("cache_total_len: %d, past_block_num: %d\n", cache_total_len,
past_block_num);
ofs.write(reinterpret_cast<const char *>(anchor_.data()),
anchor_.size() * sizeof(ggml_fp16_t));
for (int i = 0; i < config_.layer_num; ++i) {
for (int j = 0; j < config_.kv_head_num; ++j) {
for (int k = 0; k < past_block_num; ++k) {
int block_idx = block_table[k];
if (config_.kv_type == GGML_TYPE_F16) {
ofs.write(reinterpret_cast<const char *>(
k_cache_fp16_[i][j][block_idx].data()),
k_cache_fp16_[i][j][block_idx].size() *
sizeof(ggml_fp16_t));
ofs.write(reinterpret_cast<const char *>(
v_cache_fp16_[i][j][block_idx].data()),
v_cache_fp16_[i][j][block_idx].size() *
sizeof(ggml_fp16_t));
} else if (config_.kv_type == GGML_TYPE_Q4_0) {
ofs.write(reinterpret_cast<const char *>(
k_cache_q4[i][j][block_idx].data()),
k_cache_q4[i][j][block_idx].size() *
sizeof(block_q4_0));
ofs.write(reinterpret_cast<const char *>(
v_cache_q4[i][j][block_idx].data()),
v_cache_q4[i][j][block_idx].size() *
sizeof(block_q4_0));
}
}
}
for (int k = 0; k < past_block_num; ++k) {
int block_idx = block_table[k];
for (int l = 0; l < config_.block_len; l++) {
ofs.write(reinterpret_cast<const char *>(
importance_[i][block_idx][l].data()),
importance_[i][block_idx][l].size() *
sizeof(ggml_fp16_t));
}
}
}
ofs.close();
// Timer end
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
printf("time of dump: %f s\n", diff.count());
}
\ No newline at end of file
......@@ -3,8 +3,8 @@
* @Author : chenht2022
* @Date : 2024-07-12 10:07:58
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:34:58
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-15 07:45:18
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "linear.h"
......@@ -24,10 +24,14 @@ Linear::~Linear() {
shared_mem_buffer.dealloc(this);
}
void Linear::warm_up(Backend* backend) {
void Linear::warm_up(Backend *backend) {
std::vector<float> input_fp32(config_.input_size);
std::vector<uint8_t> input(config_.input_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> input(config_.input_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.output_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
for (int i = 0; i < config_.input_size; i++) {
input_fp32[i] = 0;
}
......@@ -45,7 +49,7 @@ void Linear::forward_many(int qlen, const void* input, void* output, Backend* ba
proj_input_ptr = proj_input_;
}
int nth = config_.output_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
void* proj_ptr = (uint8_t*)proj_ + ith * config_.stride * config_.input_size * ggml_type_size(config_.proj_type) / ggml_blck_size(config_.proj_type);
float* proj_output_ptr = proj_output_ + ith * config_.stride;
......@@ -57,7 +61,7 @@ void Linear::forward_many(int qlen, const void* input, void* output, Backend* ba
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
from_float(proj_output_, output, qlen * config_.output_size, config_.hidden_type);
}
......
......@@ -3,8 +3,8 @@
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:04
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-15 07:44:38
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "mlp.h"
......@@ -31,10 +31,14 @@ MLP::~MLP() {
shared_mem_buffer.dealloc(this);
}
void MLP::warm_up(Backend* backend) {
void MLP::warm_up(Backend *backend) {
std::vector<float> input_fp32(config_.hidden_size);
std::vector<uint8_t> input(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> input(config_.hidden_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.hidden_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
for (int i = 0; i < config_.hidden_size; i++) {
input_fp32[i] = 0;
}
......@@ -42,9 +46,7 @@ void MLP::warm_up(Backend* backend) {
forward_many(1, input.data(), output.data(), backend);
}
static float act_fn(float x) {
return x / (1.0f + expf(-x));
}
static float act_fn(float x) { return x / (1.0f + expf(-x)); }
void MLP::forward_many(int qlen, const void* input, void* output, Backend* backend) {
const void* gate_input_ptr;
......@@ -72,7 +74,7 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
}
}
int nth = config_.intermediate_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
void* gate_proj_ptr = (uint8_t*)gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
float* gate_output_ptr = gate_output_ + ith * config_.stride;
......@@ -90,12 +92,12 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
from_float(intermediate_fp32_, down_input_, qlen * config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
nth = config_.hidden_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
void* down_proj_ptr = (uint8_t*)down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = down_output_ + ith * config_.stride;
......@@ -107,7 +109,7 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
from_float(down_output_, output, qlen * config_.hidden_size, config_.hidden_type);
}
......
......@@ -3,8 +3,8 @@
* @Author : chenht2022
* @Date : 2024-07-22 02:03:22
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:07
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-15 07:43:41
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "moe.h"
......@@ -121,7 +121,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
}
}
int nth = config_.intermediate_size / config_.stride;
backend->do_work_stealing_job(nth * k, [&](int task_id) {
backend->do_work_stealing_job(nth * k, nullptr, [&](int task_id) {
int expert_idx = task_id / nth;
uint64_t expert_id = expert_ids[expert_idx];
int ith = task_id % nth;
......@@ -139,14 +139,14 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
void* down_input_ptr = s_down_input_[expert_idx] + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
for (int i = 0; i < k; i++) {
from_float(s_intermediate_fp32_[i], s_down_input_[i], config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
}
nth = config_.hidden_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
s_output_fp32_[i] = 0;
......@@ -165,7 +165,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
from_float(s_output_fp32_, output, config_.hidden_size, config_.hidden_type);
}
......@@ -191,7 +191,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
offset += m_local_num_[i];
}
backend->do_work_stealing_job(qlen, [&](int i) {
backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
const void* gate_input_ptr;
const void* up_input_ptr;
if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
......@@ -220,10 +220,10 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
memcpy(m_local_gate_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type), gate_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type));
memcpy(m_local_up_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type), up_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type));
}
});
}, nullptr);
int stride = QK_K;
int nth = config_.intermediate_size / stride;
backend->do_work_stealing_job(nth * config_.expert_num, [&](int task_id) {
backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
int expert_idx = task_id / nth;
int ith = task_id % nth;
void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];
......@@ -242,18 +242,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
void* down_input_ptr = m_local_down_input_ptr_[expert_idx] + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
from_float(intermediate_fp32_ptr, down_input_ptr, stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
});
}, nullptr);
stride = QK_K;
nth = config_.hidden_size / stride;
backend->do_work_stealing_job(nth * config_.expert_num, [&](int task_id) {
backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
int expert_idx = task_id / nth;
int ith = task_id % nth;
void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
});
backend->do_work_stealing_job(qlen, [&](int i) {
}, nullptr);
backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
for (int e = 0; e < config_.hidden_size; e++) {
m_output_fp32_[i][e] = 0;
}
......@@ -263,7 +263,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
}
}
from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
});
}, nullptr);
}
void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
......
# Copyright 2024 Shaoyuan Chen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Description :
Author : Boxin Zhang, Azure-Tang
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os
import platform
import sys
project_dir = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, project_dir)
import torch
......@@ -31,6 +25,7 @@ import fire
from ktransformers.optimize.optimize import optimize_and_load_gguf
from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
from ktransformers.models.modeling_llama import LlamaForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
from ktransformers.util.utils import prefill_and_generate
from ktransformers.server.config.config import Config
......@@ -38,38 +33,56 @@ from ktransformers.server.config.config import Config
custom_models = {
"DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
"Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
"LlamaForCausalLM": LlamaForCausalLM,
"MixtralForCausalLM": MixtralForCausalLM,
}
ktransformer_rules_dir = os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
default_optimize_rules ={
ktransformer_rules_dir = (
os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
)
default_optimize_rules = {
"DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
"Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
"LlamaForCausalLM": ktransformer_rules_dir + "Internlm2_5-7b-Chat-1m.yaml",
"MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml",
}
def local_chat(
model_path: str,
model_path: str | None = None,
optimize_rule_path: str = None,
gguf_path: str = None,
gguf_path: str | None = None,
max_new_tokens: int = 1000,
cpu_infer: int = Config().cpu_infer,
use_cuda_graph: bool = True,
prompt_file : str | None = None,
mode: str = "normal",
):
torch.set_grad_enabled(False)
Config().cpu_infer = cpu_infer
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
torch.set_default_dtype(config.torch_dtype)
if mode == 'long_context':
torch.set_default_dtype(torch.float16)
else:
torch.set_default_dtype(config.torch_dtype)
with torch.device("meta"):
if config.architectures[0] in custom_models:
print("using custom modeling_xxx.py.")
if "Qwen2Moe" in config.architectures[0]: # Qwen2Moe must use flash_attention_2 to avoid overflow.
if (
"Qwen2Moe" in config.architectures[0]
): # Qwen2Moe must use flash_attention_2 to avoid overflow.
config._attn_implementation = "flash_attention_2"
if "Mixtral" in config.architectures[0]:
if "Llama" in config.architectures[0]:
config._attn_implementation = "eager"
if "Mixtral" in config.architectures[0]:
config._attn_implementation = "flash_attention_2"
model = custom_models[config.architectures[0]](config)
else:
model = AutoModelForCausalLM.from_config(
......@@ -95,26 +108,50 @@ def local_chat(
if model.generation_config.pad_token_id is None:
model.generation_config.pad_token_id = model.generation_config.eos_token_id
model.eval()
logging.basicConfig(level=logging.INFO)
system = platform.system()
if (system == u'Windows'):
os.system('cls')
if system == "Windows":
os.system("cls")
else:
os.system('clear')
os.system("clear")
while True:
content = input("Chat: ")
if content == "":
content = "Please write a piece of quicksort code in C++."
if content.startswith('"""'): # prefix """
# multi lines input
content = content[3:] + "\n"
while True:
line = input("")
if line.endswith('"""'):
# end multi lines input
line = line[:-3] # suffix """
if line:
content += line + "\n"
break
else:
content += line + "\n"
if content == "":
if prompt_file != None:
content = open(prompt_file, "r").read()
else:
content = "Please write a piece of quicksort code in C++."
elif os.path.isfile(content):
content = open(content, "r").read()
messages = [{"role": "user", "content": content}]
input_tensor = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, return_tensors="pt"
)
torch.set_default_dtype(torch.bfloat16) # TODO: Remove this, replace dtype using config
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph)
assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
"please change max_seq_len in ~/.ktransformers/config.yaml"
torch.set_default_dtype(
torch.bfloat16
) # TODO: Remove this, replace dtype using config
generated = prefill_and_generate(
model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode
)
if __name__ == "__main__":
fire.Fire(local_chat)
\ No newline at end of file
fire.Fire(local_chat)
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""LLaMA model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
class LlamaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the LLaMA-7B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`LlamaModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details check out [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
Llama 2 up to 4096, CodeLlama up to 16384.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
`factor` (`float`, *optional*):
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
original maximum pre-trained length.
`original_max_position_embeddings` (`int`, *optional*):
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
pretraining.
`attention_factor` (`float`, *optional*):
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
computation. If unspecified, it defaults to value recommended by the implementation, using the
`factor` field to infer the suggested value.
`beta_fast` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
ramp function. If unspecified, it defaults to 32.
`beta_slow` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
ramp function. If unspecified, it defaults to 1.
`short_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`long_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to long contexts (>
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`low_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
`high_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
```python
>>> from transformers import LlamaModel, LlamaConfig
>>> # Initializing a LLaMA llama-7b style configuration
>>> configuration = LlamaConfig()
>>> # Initializing a model from the llama-7b style configuration
>>> model = LlamaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "llama"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=32000,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
mlp_bias=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
# Validate the correctness of rotary position embeddings parameters
# BC: if there is a 'type' field, move it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self)
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
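As a usage note on the rope_scaling contract documented in the class docstring above: the sketch below shows how this configuration might be instantiated with a 'llama3' scaling dict. It is not part of the diff; the import path and the concrete numeric values are illustrative assumptions.
```python
# Hypothetical example: build a LlamaConfig with 'llama3' rope scaling, using the
# rope_scaling keys documented in the docstring above. Import path and values are
# assumptions, not taken from this commit.
from ktransformers.models.configuration_llama import LlamaConfig

config = LlamaConfig(
    max_position_embeddings=131072,  # extended context length
    rope_theta=500000.0,             # RoPE base period
    rope_scaling={
        "rope_type": "llama3",
        "factor": 8.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_max_position_embeddings": 8192,
    },
)
print(config.rope_scaling["rope_type"])  # -> "llama3"
```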
This diff is collapsed.
'''
"""
Description :
Author : Boxin Zhang
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
"""
from torch import nn
from ktransformers.models.modeling_deepseek import DeepseekV2YarnRotaryEmbedding, DeepseekV2RotaryEmbedding
from transformers import ROPE_INIT_FUNCTIONS
from ktransformers.models.modeling_llama import (
LlamaRotaryEmbedding,
LlamaLinearScalingRotaryEmbedding,
LlamaDynamicNTKScalingRotaryEmbedding,
)
from ktransformers.models.modeling_deepseek import (
DeepseekV2YarnRotaryEmbedding,
DeepseekV2RotaryEmbedding,
)
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.utils import InferenceState
from transformers.configuration_utils import PretrainedConfig
# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2Moe
class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
self.orig_module.__init__(orig_module.dim,
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim, orig_module.max_position_embeddings, orig_module.base
)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self):
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.device,
)
class RotaryEmbeddingV2(BaseInjectedModule, LlamaRotaryEmbedding):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base)
orig_module.base,
None,
orig_module.scaling_factor,
orig_module.rope_type,
orig_module.config,
)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self):
self.orig_module.__init__(self.orig_module.dim,
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.device)
self.device,
self.orig_module.scaling_factor,
self.orig_module.rope_type,
self.orig_module.config,
)
class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
self.orig_module.__init__(orig_module.dim,
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base,
None, #device
None, # device
orig_module.scaling_factor,
orig_module.original_max_position_embeddings,
orig_module.beta_fast,
orig_module.beta_slow,
orig_module.mscale,
orig_module.mscale_all_dim)
orig_module.mscale_all_dim,
)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self):
self.orig_module.__init__(self.orig_module.dim,
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.generate_device,
......@@ -70,5 +131,42 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
self.orig_module.beta_fast,
self.orig_module.beta_slow,
self.orig_module.mscale,
self.orig_module.mscale_all_dim)
self.orig_module.mscale_all_dim,
)
class DynamicNTKScalingRotaryEmbedding(
BaseInjectedModule, LlamaDynamicNTKScalingRotaryEmbedding
):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base,
None, # device
orig_module.scaling_factor,
orig_module.rope_type,
orig_module.config,
)
def load(self):
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.orig_module.device,
self.orig_module.scaling_factor,
self.orig_module.rope_type,
self.orig_module.config,
)