Unverified Commit 233bbb8c authored by UnicornChan, committed by GitHub

Merge pull request #57 from UnicornChan/develop-0.1.3

[feature] release 0.1.3
parents 67f8b370 4d1d561d
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : Jianwei Dong
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
import torch
layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
layer_step: int = 1
token_step: int = 1
layer_offset: int = 0
max_thread_num: int = 64
max_batch_size: int = 1
max_block_num: int = 1024
CPUInfer = cpuinfer_ext.CPUInfer(max_thread_num)
warm_up_iter = 1000
test_iter = 10000
def bench_linear(cache_seqlen: int):
with torch.inference_mode(mode=True):
cache_seqlens = torch.tensor([cache_seqlen], dtype=torch.int32, device="cpu")
seqlens_zero = torch.zeros((1,), dtype=torch.int32, device="cpu")
config = cpuinfer_ext.kvcache.KVCacheConfig(
layer_num,
kv_head_num,
q_head_num,
head_dim,
block_len,
anchor_num,
anchor_type,
kv_type,
retrieval_type,
layer_step,
token_step,
layer_offset,
max_block_num,
max_batch_size,
max_thread_num,
)
local_kvcache = cpuinfer_ext.kvcache.KVCache(config)
block_table = (
torch.arange(max_block_num, dtype=torch.int32, device="cpu")
.contiguous()
.view(1, -1)
)
for layer_idx in range(layer_num):
k_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim),
dtype=torch.float16,
device="cpu",
).contiguous()
v_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim),
dtype=torch.float16,
device="cpu",
).contiguous()
CPUInfer.submit(
local_kvcache.update_kvcache_fp16(
k_cache.data_ptr(),
v_cache.data_ptr(),
layer_idx,
block_table.data_ptr(),
1,
max_block_num,
seqlens_zero.data_ptr(),
cache_seqlen,
)
)
CPUInfer.sync()
input = torch.randn(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
output = torch.empty(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
# attn_lse: (bsz, q_len, q_head_num)
attn_lse = torch.empty(
(1, 1, q_head_num), dtype=torch.float32, device="cpu"
).contiguous()
input = input / 100
# warm up
for i in range(warm_up_iter):
CPUInfer.submit(
local_kvcache.attn(
input.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
i % layer_num,
0,
1,
1,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
-1,
-1,
-1,
)
)
CPUInfer.sync()
# test
start = time.perf_counter()
for i in range(test_iter):
CPUInfer.submit(
local_kvcache.attn(
input.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
i % layer_num,
0,
1,
1,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
-1,
-1,
-1,
)
)
CPUInfer.sync()
end = time.perf_counter()
total_time = end - start
print("cache sequence length: ", cache_seqlen)
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
cache_seqlen
* kv_head_num
* head_dim
* 2
* 2
* test_iter
/ total_time
/ 1000
/ 1000
/ 1000,
"GB/s",
)
print("")
bench_linear(1024)
bench_linear(4096)
bench_linear(16384)
bench_linear(32768)
bench_linear(65536)
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : Jianwei Dong
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
import torch
layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
warm_up_iter = 1000
test_iter = 10000
def bench_linear(cache_seqlen: int, device):
with torch.inference_mode(mode=True):
kvcaches = []
for layer_idx in range(layer_num):
k_cache = torch.randn(
(1, 32, cache_seqlen, head_dim),
dtype=torch.float16,
device=device,
).contiguous()
v_cache = torch.randn(
(1, 32, cache_seqlen, head_dim),
dtype=torch.float16,
device=device,
).contiguous()
kvcaches.append((k_cache, v_cache))
input = torch.randn(
(1, q_head_num, 1, head_dim), dtype=torch.float16, device=device
).contiguous()
input = input / 100
# warm up
for i in range(warm_up_iter):
k_cache = kvcaches[i % layer_num][0]
v_cache = kvcaches[i % layer_num][1]
torch.nn.functional.scaled_dot_product_attention(input, k_cache, v_cache)
# test
start = time.perf_counter()
for i in range(test_iter):
k_cache = kvcaches[i % layer_num][0]
v_cache = kvcaches[i % layer_num][1]
torch.nn.functional.scaled_dot_product_attention(input, k_cache, v_cache)
end = time.perf_counter()
total_time = end - start
print("cache sequence length: ", cache_seqlen)
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", total_time / test_iter * 1000000)
print(
"Bandwidth: ",
cache_seqlen
* q_head_num
* head_dim
* 2
* 2
* test_iter
/ total_time
/ 1000
/ 1000
/ 1000,
"GB/s",
)
print("")
bench_linear(1024, "cpu")
bench_linear(4096, "cpu")
bench_linear(1024, "cuda")
bench_linear(4096, "cuda")
bench_linear(16384, "cuda")
bench_linear(32768, "cuda")
bench_linear(65536, "cuda")
......@@ -3,93 +3,125 @@
* @Author : chenht2022
* @Date : 2024-07-22 02:03:05
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:34
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "backend.h"
Backend::Backend(int thread_num) {
thread_num_ = thread_num;
thread_state_.resize(thread_num);
for (int i = 0; i < thread_num; i++) {
thread_local int Backend::thread_local_id = -1;
Backend::Backend(int max_thread_num) {
max_thread_num_ = max_thread_num;
thread_state_.resize(max_thread_num_);
for (int i = 0; i < max_thread_num_; i++) {
thread_state_[i].curr = std::make_unique<std::atomic<int>>();
thread_state_[i].status = std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
thread_state_[i].status =
std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
}
workers_.resize(thread_num);
for (int i = 1; i < thread_num; i++) {
workers_.resize(max_thread_num_);
for (int i = 1; i < max_thread_num_; i++) {
workers_[i] = std::thread(&Backend::worker_thread, this, i);
}
}
Backend::~Backend() {
for (int i = 0; i < thread_num_; i++) {
thread_state_[i].status->store(ThreadStatus::EXIT, std::memory_order_release);
for (int i = 0; i < max_thread_num_; i++) {
thread_state_[i].status->store(ThreadStatus::EXIT,
std::memory_order_release);
}
for (int i = 1; i < thread_num_; i++) {
for (int i = 1; i < max_thread_num_; i++) {
if (workers_[i].joinable()) {
workers_[i].join();
}
}
}
int Backend::get_thread_num() {
return thread_num_;
}
int Backend::get_thread_num() { return max_thread_num_; }
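// do_work_stealing_job splits task_num tasks across at most max_thread_num_ threads.
// For each participating thread: init_func (if non-null) runs once before any task,
// compute_func runs once per task id (stealing from other threads' ranges once the
// local range is exhausted), and finalize_func (if non-null) runs once afterwards.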
void Backend::do_work_stealing_job(int task_num, std::function<void(int)> func) {
func_ = func;
void Backend::do_work_stealing_job(int task_num,
std::function<void(int)> init_func,
std::function<void(int)> compute_func,
std::function<void(int)> finalize_func) {
init_func_ = init_func;
compute_func_ = compute_func;
finalize_func_ = finalize_func;
thread_num_ = std::min(max_thread_num_, task_num);
int base = task_num / thread_num_;
int remain = task_num % thread_num_;
thread_state_[0].end = base + (0 < remain);
    // set thread_local_id for the main thread
thread_local_id = 0;
for (int i = 1; i < thread_num_; i++) {
thread_state_[i].curr->store(thread_state_[i - 1].end, std::memory_order_relaxed);
thread_state_[i].curr->store(thread_state_[i - 1].end,
std::memory_order_relaxed);
thread_state_[i].end = thread_state_[i - 1].end + base + (i < remain);
thread_state_[i].status->store(ThreadStatus::WORKING, std::memory_order_release);
thread_state_[i].status->store(ThreadStatus::WORKING,
std::memory_order_release);
}
thread_state_[0].curr->store(0, std::memory_order_relaxed);
thread_state_[0].status->store(ThreadStatus::WORKING, std::memory_order_release);
thread_state_[0].status->store(ThreadStatus::WORKING,
std::memory_order_release);
process_tasks(0);
for (int i = 1; i < thread_num_; i++) {
while (thread_state_[i].status->load(std::memory_order_acquire) == ThreadStatus::WORKING) {
while (thread_state_[i].status->load(std::memory_order_acquire) ==
ThreadStatus::WORKING) {
}
}
}
void Backend::process_tasks(int thread_id) {
if (init_func_ != nullptr) {
init_func_(thread_id);
}
while (true) {
int task_id = thread_state_[thread_id].curr->fetch_add(1, std::memory_order_acq_rel);
int task_id = thread_state_[thread_id].curr->fetch_add(
1, std::memory_order_acq_rel);
if (task_id >= thread_state_[thread_id].end) {
break;
}
func_(task_id);
compute_func_(task_id);
}
for (int t_offset = 1; t_offset < thread_num_; t_offset++) {
int t_i = (thread_id + t_offset) % thread_num_;
if (thread_state_[t_i].status->load(std::memory_order_acquire) != ThreadStatus::WORKING) {
if (thread_state_[t_i].status->load(std::memory_order_acquire) !=
ThreadStatus::WORKING) {
continue;
}
while (true) {
int task_id = thread_state_[t_i].curr->fetch_add(1, std::memory_order_acq_rel);
int task_id = thread_state_[t_i].curr->fetch_add(
1, std::memory_order_acq_rel);
if (task_id >= thread_state_[t_i].end) {
break;
}
func_(task_id);
compute_func_(task_id);
}
}
thread_state_[thread_id].status->store(ThreadStatus::WAITING, std::memory_order_release);
if (finalize_func_ != nullptr) {
finalize_func_(thread_id);
}
thread_state_[thread_id].status->store(ThreadStatus::WAITING,
std::memory_order_release);
}
void Backend::worker_thread(int thread_id) {
auto start = std::chrono::steady_clock::now();
    thread_local_id = thread_id; // set the thread-local id for this worker thread
while (true) {
ThreadStatus status = thread_state_[thread_id].status->load(std::memory_order_acquire);
ThreadStatus status =
thread_state_[thread_id].status->load(std::memory_order_acquire);
if (status == ThreadStatus::WORKING) {
process_tasks(thread_id);
start = std::chrono::steady_clock::now();
} else if (status == ThreadStatus::WAITING) {
auto now = std::chrono::steady_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(now - start).count();
auto duration =
std::chrono::duration_cast<std::chrono::milliseconds>(now -
start)
.count();
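            // Back off once this worker has been idle for more than 50 ms; spin otherwise.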
if (duration > 50) {
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
......
......@@ -3,7 +3,7 @@
* @Author : chenht2022
* @Date : 2024-07-22 02:03:05
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:38
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
......@@ -31,20 +31,25 @@ struct ThreadState {
};
class Backend {
public:
public:
Backend(int);
~Backend();
int get_thread_num();
void do_work_stealing_job(int, std::function<void(int)>);
void do_work_stealing_job(int, std::function<void(int)>,
std::function<void(int)>,
std::function<void(int)>);
static thread_local int thread_local_id;
private:
private:
int thread_num_;
std::vector<ThreadState> thread_state_; // [thread_num]
std::function<void(int)> func_;
int max_thread_num_;
std::vector<ThreadState> thread_state_; // [thread_num]
std::function<void(int)> init_func_;
std::function<void(int)> compute_func_;
std::function<void(int)> finalize_func_;
std::vector<std::thread> workers_;
void process_tasks(int);
void worker_thread(int);
};
#endif
\ No newline at end of file
......@@ -54,4 +54,4 @@ void TaskQueue::processTasks() {
}
mutex.unlock();
}
}
}
\ No newline at end of file
......@@ -4,7 +4,7 @@
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenxl
* @LastEditTime : 2024-08-12 12:28:25
* @LastEditTime : 2024-08-08 04:23:51
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_TASKQUEUE_H
......
#!/usr/bin/env python
# coding=utf-8
"""
Description :
Author : Jianwei Dong
Date : 2024-08-28 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-08-28 10:32:05
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + "/../build")
import cpuinfer_ext
from flash_attn import flash_attn_with_kvcache
import torch
layer_num = 10
kv_head_num = 8
q_head_num = 32
head_dim = 128
block_len = 128
anchor_num = 1
cache_seqlen = 8192
cache_seqlens = torch.tensor([cache_seqlen], dtype=torch.int32, device="cpu")
seqlens_zero = torch.zeros((1,), dtype=torch.int32, device="cpu")
anchor_type = cpuinfer_ext.kvcache.AnchorType.DYNAMIC
kv_type = cpuinfer_ext.kvcache.ggml_type.FP16
retrieval_type = cpuinfer_ext.kvcache.RetrievalType.LAYER
layer_step: int = 1
token_step: int = 1
layer_offset: int = 0
max_thread_num: int = 2
max_batch_size: int = 1
max_block_num: int = 512
CPUInfer = cpuinfer_ext.CPUInfer(max_thread_num)
validation_iter = 100
with torch.inference_mode(mode=True):
config = cpuinfer_ext.kvcache.KVCacheConfig(
layer_num,
kv_head_num,
q_head_num,
head_dim,
block_len,
anchor_num,
anchor_type,
kv_type,
retrieval_type,
layer_step,
token_step,
layer_offset,
max_block_num,
max_batch_size,
max_thread_num,
)
local_kvcache = cpuinfer_ext.kvcache.KVCache(config)
kvcaches = []
block_table = (
torch.arange(max_block_num, dtype=torch.int32, device="cpu")
.contiguous()
.view(1, -1)
)
for layer_idx in range(layer_num):
k_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
v_cache = torch.randn(
(1, cache_seqlen, kv_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
CPUInfer.submit(
local_kvcache.update_kvcache_fp16(
k_cache.data_ptr(),
v_cache.data_ptr(),
layer_idx,
block_table.data_ptr(),
1,
max_block_num,
seqlens_zero.data_ptr(),
cache_seqlen,
)
)
CPUInfer.sync()
kvcaches.append((k_cache.to("cuda"), v_cache.to("cuda")))
# validation
for i in range(validation_iter):
k_cache = kvcaches[i % layer_num][0]
v_cache = kvcaches[i % layer_num][1]
input = torch.randn(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
output = torch.empty(
(1, 1, q_head_num, head_dim), dtype=torch.float16, device="cpu"
).contiguous()
# attn_lse: (bsz, q_len, q_head_num)
attn_lse = torch.empty(
(1, 1, q_head_num), dtype=torch.float32, device="cpu"
).contiguous()
input = input / 100
CPUInfer.submit(
local_kvcache.attn(
input.data_ptr(),
output.data_ptr(),
attn_lse.data_ptr(),
i % layer_num,
0,
1,
1,
max_block_num,
block_table.data_ptr(),
cache_seqlens.data_ptr(),
-1,
-1,
-1,
)
)
CPUInfer.sync()
# print("cpuinfer output", output)
t_output = flash_attn_with_kvcache(
q=input.to("cuda"),
k_cache=k_cache,
v_cache=v_cache,
cache_seqlens=cache_seqlens.to("cuda"),
)
# print("torch output", t_output)
diff = torch.mean(torch.abs(output.to("cuda") - t_output)) / torch.mean(
torch.abs(t_output)
)
print("diff = ", diff)
assert diff < 0.001
This diff is collapsed.
/**
* @Description :
* @Author : Jianwei Dong
* @Date : 2024-08-26 22:47:06
* @Version : 1.0.0
* @LastEditors : Jianwei Dong
* @LastEditTime : 2024-08-26 22:47:06
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "kvcache.h"
void KVCache::load_kvcache(std::string tensor_file_path, Backend *backend) {
// Timer start
auto start = std::chrono::high_resolution_clock::now();
std::ifstream ifs_tensor(tensor_file_path, std::ios::binary);
if (!ifs_tensor) {
throw std::runtime_error("Failed to open tensor file");
}
ifs_tensor.read(reinterpret_cast<char *>(&cache_total_len_),
sizeof(cache_total_len_));
int past_block_num =
(cache_total_len_ + config_.block_len - 1) / config_.block_len;
printf("cache_total_len: %d, past_block_num: %d\n", cache_total_len_,
past_block_num);
for (int i = 0; i < config_.layer_num; ++i) {
past_block_num_[i] = past_block_num;
}
ifs_tensor.read(reinterpret_cast<char *>(anchor_.data()),
anchor_.size() * sizeof(ggml_fp16_t));
for (int i = 0; i < config_.layer_num; ++i) {
for (int j = 0; j < config_.kv_head_num; ++j) {
for (int k = 0; k < past_block_num_[i]; ++k) {
if (config_.kv_type == GGML_TYPE_F16) {
ifs_tensor.read(
reinterpret_cast<char *>(k_cache_fp16_[i][j][k].data()),
k_cache_fp16_[i][j][k].size() * sizeof(ggml_fp16_t));
ifs_tensor.read(
reinterpret_cast<char *>(v_cache_fp16_[i][j][k].data()),
v_cache_fp16_[i][j][k].size() * sizeof(ggml_fp16_t));
} else if (config_.kv_type == GGML_TYPE_Q4_0) {
ifs_tensor.read(
reinterpret_cast<char *>(k_cache_q4[i][j][k].data()),
k_cache_q4[i][j][k].size() * sizeof(block_q4_0));
ifs_tensor.read(
reinterpret_cast<char *>(v_cache_q4[i][j][k].data()),
v_cache_q4[i][j][k].size() * sizeof(block_q4_0));
}
}
}
for (int k = 0; k < past_block_num_[i]; ++k) {
for (int l = 0; l < config_.block_len; l++) {
ifs_tensor.read(
reinterpret_cast<char *>(importance_[i][k][l].data()),
importance_[i][k][l].size() * sizeof(ggml_fp16_t));
}
}
}
ifs_tensor.close();
// Timer end
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
printf("time of load: %f s\n", diff.count());
}
void KVCache::dump_kvcache(int *block_table, int cache_total_len,
std::string tensor_file_path, Backend *backend) {
// Timer start
auto start = std::chrono::high_resolution_clock::now();
std::ofstream ofs(tensor_file_path, std::ios::binary);
printf("dump_kvcache: %s\n", tensor_file_path.c_str());
if (!ofs.is_open()) {
std::cerr << "Cannot open file " << tensor_file_path << std::endl;
return;
}
ofs.write(reinterpret_cast<const char *>(&cache_total_len),
sizeof(cache_total_len));
int past_block_num =
(cache_total_len + config_.block_len - 1) / config_.block_len;
printf("cache_total_len: %d, past_block_num: %d\n", cache_total_len,
past_block_num);
ofs.write(reinterpret_cast<const char *>(anchor_.data()),
anchor_.size() * sizeof(ggml_fp16_t));
for (int i = 0; i < config_.layer_num; ++i) {
for (int j = 0; j < config_.kv_head_num; ++j) {
for (int k = 0; k < past_block_num; ++k) {
int block_idx = block_table[k];
if (config_.kv_type == GGML_TYPE_F16) {
ofs.write(reinterpret_cast<const char *>(
k_cache_fp16_[i][j][block_idx].data()),
k_cache_fp16_[i][j][block_idx].size() *
sizeof(ggml_fp16_t));
ofs.write(reinterpret_cast<const char *>(
v_cache_fp16_[i][j][block_idx].data()),
v_cache_fp16_[i][j][block_idx].size() *
sizeof(ggml_fp16_t));
} else if (config_.kv_type == GGML_TYPE_Q4_0) {
ofs.write(reinterpret_cast<const char *>(
k_cache_q4[i][j][block_idx].data()),
k_cache_q4[i][j][block_idx].size() *
sizeof(block_q4_0));
ofs.write(reinterpret_cast<const char *>(
v_cache_q4[i][j][block_idx].data()),
v_cache_q4[i][j][block_idx].size() *
sizeof(block_q4_0));
}
}
}
for (int k = 0; k < past_block_num; ++k) {
int block_idx = block_table[k];
for (int l = 0; l < config_.block_len; l++) {
ofs.write(reinterpret_cast<const char *>(
importance_[i][block_idx][l].data()),
importance_[i][block_idx][l].size() *
sizeof(ggml_fp16_t));
}
}
}
ofs.close();
// Timer end
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
printf("time of dump: %f s\n", diff.count());
}
\ No newline at end of file
......@@ -3,8 +3,8 @@
* @Author : chenht2022
* @Date : 2024-07-12 10:07:58
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:34:58
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-15 07:45:18
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "linear.h"
......@@ -24,10 +24,14 @@ Linear::~Linear() {
shared_mem_buffer.dealloc(this);
}
void Linear::warm_up(Backend* backend) {
void Linear::warm_up(Backend *backend) {
std::vector<float> input_fp32(config_.input_size);
std::vector<uint8_t> input(config_.input_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> input(config_.input_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.output_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
for (int i = 0; i < config_.input_size; i++) {
input_fp32[i] = 0;
}
......@@ -45,7 +49,7 @@ void Linear::forward_many(int qlen, const void* input, void* output, Backend* ba
proj_input_ptr = proj_input_;
}
int nth = config_.output_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
void* proj_ptr = (uint8_t*)proj_ + ith * config_.stride * config_.input_size * ggml_type_size(config_.proj_type) / ggml_blck_size(config_.proj_type);
float* proj_output_ptr = proj_output_ + ith * config_.stride;
......@@ -57,7 +61,7 @@ void Linear::forward_many(int qlen, const void* input, void* output, Backend* ba
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
from_float(proj_output_, output, qlen * config_.output_size, config_.hidden_type);
}
......
......@@ -3,8 +3,8 @@
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:04
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-15 07:44:38
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "mlp.h"
......@@ -31,10 +31,14 @@ MLP::~MLP() {
shared_mem_buffer.dealloc(this);
}
void MLP::warm_up(Backend* backend) {
void MLP::warm_up(Backend *backend) {
std::vector<float> input_fp32(config_.hidden_size);
std::vector<uint8_t> input(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> input(config_.hidden_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
std::vector<uint8_t> output(config_.hidden_size *
ggml_type_size(config_.hidden_type) /
ggml_blck_size(config_.hidden_type));
for (int i = 0; i < config_.hidden_size; i++) {
input_fp32[i] = 0;
}
......@@ -42,9 +46,7 @@ void MLP::warm_up(Backend* backend) {
forward_many(1, input.data(), output.data(), backend);
}
static float act_fn(float x) {
return x / (1.0f + expf(-x));
}
static float act_fn(float x) { return x / (1.0f + expf(-x)); }
void MLP::forward_many(int qlen, const void* input, void* output, Backend* backend) {
const void* gate_input_ptr;
......@@ -72,7 +74,7 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
}
}
int nth = config_.intermediate_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
void* gate_proj_ptr = (uint8_t*)gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
float* gate_output_ptr = gate_output_ + ith * config_.stride;
......@@ -90,12 +92,12 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
from_float(intermediate_fp32_, down_input_, qlen * config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
nth = config_.hidden_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
void* down_proj_ptr = (uint8_t*)down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = down_output_ + ith * config_.stride;
......@@ -107,7 +109,7 @@ void MLP::forward_many(int qlen, const void* input, void* output, Backend* backe
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
from_float(down_output_, output, qlen * config_.hidden_size, config_.hidden_type);
}
......
......@@ -3,8 +3,8 @@
* @Author : chenht2022
* @Date : 2024-07-22 02:03:22
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:07
* @LastEditors : kkk1nak0
* @LastEditTime : 2024-08-15 07:43:41
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "moe.h"
......@@ -121,7 +121,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
}
}
int nth = config_.intermediate_size / config_.stride;
backend->do_work_stealing_job(nth * k, [&](int task_id) {
backend->do_work_stealing_job(nth * k, nullptr, [&](int task_id) {
int expert_idx = task_id / nth;
uint64_t expert_id = expert_ids[expert_idx];
int ith = task_id % nth;
......@@ -139,14 +139,14 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
void* down_input_ptr = s_down_input_[expert_idx] + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
for (int i = 0; i < k; i++) {
from_float(s_intermediate_fp32_[i], s_down_input_[i], config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
}
nth = config_.hidden_size / config_.stride;
backend->do_work_stealing_job(nth, [&](int task_id) {
backend->do_work_stealing_job(nth, nullptr, [&](int task_id) {
int ith = task_id;
for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
s_output_fp32_[i] = 0;
......@@ -165,7 +165,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
}
});
}, nullptr);
if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
from_float(s_output_fp32_, output, config_.hidden_size, config_.hidden_type);
}
......@@ -191,7 +191,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
offset += m_local_num_[i];
}
backend->do_work_stealing_job(qlen, [&](int i) {
backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
const void* gate_input_ptr;
const void* up_input_ptr;
if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
......@@ -220,10 +220,10 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
memcpy(m_local_gate_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type), gate_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type));
memcpy(m_local_up_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type), up_input_ptr, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type));
}
});
}, nullptr);
int stride = QK_K;
int nth = config_.intermediate_size / stride;
backend->do_work_stealing_job(nth * config_.expert_num, [&](int task_id) {
backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
int expert_idx = task_id / nth;
int ith = task_id % nth;
void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];
......@@ -242,18 +242,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
void* down_input_ptr = m_local_down_input_ptr_[expert_idx] + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
from_float(intermediate_fp32_ptr, down_input_ptr, stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
}
});
}, nullptr);
stride = QK_K;
nth = config_.hidden_size / stride;
backend->do_work_stealing_job(nth * config_.expert_num, [&](int task_id) {
backend->do_work_stealing_job(nth * config_.expert_num, nullptr, [&](int task_id) {
int expert_idx = task_id / nth;
int ith = task_id % nth;
void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
});
backend->do_work_stealing_job(qlen, [&](int i) {
}, nullptr);
backend->do_work_stealing_job(qlen, nullptr, [&](int i) {
for (int e = 0; e < config_.hidden_size; e++) {
m_output_fp32_[i][e] = 0;
}
......@@ -263,7 +263,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
}
}
from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
});
}, nullptr);
}
void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
......
# Copyright 2024 Shaoyuan Chen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Description :
Author : Boxin Zhang, Azure-Tang
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
"""
import os
import platform
import sys
project_dir = os.path.dirname(os.path.dirname(__file__))
sys.path.insert(0, project_dir)
import torch
......@@ -31,6 +25,7 @@ import fire
from ktransformers.optimize.optimize import optimize_and_load_gguf
from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
from ktransformers.models.modeling_llama import LlamaForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
from ktransformers.util.utils import prefill_and_generate
from ktransformers.server.config.config import Config
......@@ -38,38 +33,56 @@ from ktransformers.server.config.config import Config
custom_models = {
"DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
"Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
"LlamaForCausalLM": LlamaForCausalLM,
"MixtralForCausalLM": MixtralForCausalLM,
}
ktransformer_rules_dir = os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
default_optimize_rules ={
ktransformer_rules_dir = (
os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
)
default_optimize_rules = {
"DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
"Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
"LlamaForCausalLM": ktransformer_rules_dir + "Internlm2_5-7b-Chat-1m.yaml",
"MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml",
}
def local_chat(
model_path: str,
model_path: str | None = None,
optimize_rule_path: str = None,
gguf_path: str = None,
gguf_path: str | None = None,
max_new_tokens: int = 1000,
cpu_infer: int = Config().cpu_infer,
use_cuda_graph: bool = True,
prompt_file : str | None = None,
mode: str = "normal",
):
torch.set_grad_enabled(False)
Config().cpu_infer = cpu_infer
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
torch.set_default_dtype(config.torch_dtype)
if mode == 'long_context':
torch.set_default_dtype(torch.float16)
else:
torch.set_default_dtype(config.torch_dtype)
with torch.device("meta"):
if config.architectures[0] in custom_models:
print("using custom modeling_xxx.py.")
if "Qwen2Moe" in config.architectures[0]: # Qwen2Moe must use flash_attention_2 to avoid overflow.
if (
"Qwen2Moe" in config.architectures[0]
): # Qwen2Moe must use flash_attention_2 to avoid overflow.
config._attn_implementation = "flash_attention_2"
if "Mixtral" in config.architectures[0]:
if "Llama" in config.architectures[0]:
config._attn_implementation = "eager"
if "Mixtral" in config.architectures[0]:
config._attn_implementation = "flash_attention_2"
model = custom_models[config.architectures[0]](config)
else:
model = AutoModelForCausalLM.from_config(
......@@ -95,26 +108,50 @@ def local_chat(
if model.generation_config.pad_token_id is None:
model.generation_config.pad_token_id = model.generation_config.eos_token_id
model.eval()
logging.basicConfig(level=logging.INFO)
system = platform.system()
if (system == u'Windows'):
os.system('cls')
if system == "Windows":
os.system("cls")
else:
os.system('clear')
os.system("clear")
while True:
content = input("Chat: ")
if content == "":
content = "Please write a piece of quicksort code in C++."
if content.startswith('"""'): # prefix """
# multi lines input
content = content[3:] + "\n"
while True:
line = input("")
if line.endswith('"""'):
# end multi lines input
line = line[:-3] # suffix """
if line:
content += line + "\n"
break
else:
content += line + "\n"
if content == "":
if prompt_file != None:
content = open(prompt_file, "r").read()
else:
content = "Please write a piece of quicksort code in C++."
elif os.path.isfile(content):
content = open(content, "r").read()
messages = [{"role": "user", "content": content}]
input_tensor = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, return_tensors="pt"
)
torch.set_default_dtype(torch.bfloat16) # TODO: Remove this, replace dtype using config
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph)
assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
"please change max_seq_len in ~/.ktransformers/config.yaml"
torch.set_default_dtype(
torch.bfloat16
) # TODO: Remove this, replace dtype using config
generated = prefill_and_generate(
model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode
)
if __name__ == "__main__":
fire.Fire(local_chat)
\ No newline at end of file
fire.Fire(local_chat)
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""LLaMA model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
class LlamaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the LLaMA-7B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`LlamaModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details check out [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
Llama 2 up to 4096, CodeLlama up to 16384.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
`factor` (`float`, *optional*):
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
original maximum pre-trained length.
`original_max_position_embeddings` (`int`, *optional*):
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
pretraining.
`attention_factor` (`float`, *optional*):
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
computation. If unspecified, it defaults to value recommended by the implementation, using the
`factor` field to infer the suggested value.
`beta_fast` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
ramp function. If unspecified, it defaults to 32.
`beta_slow` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
ramp function. If unspecified, it defaults to 1.
`short_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`long_factor` (`List[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to long contexts (>
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`low_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
`high_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
```python
>>> from transformers import LlamaModel, LlamaConfig
>>> # Initializing a LLaMA llama-7b style configuration
>>> configuration = LlamaConfig()
>>> # Initializing a model from the llama-7b style configuration
>>> model = LlamaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "llama"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=32000,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
mlp_bias=False,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
# Validate the correctness of rotary position embeddings parameters
# BC: if there is a 'type' field, move it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
rope_config_validation(self)
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
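As a usage note on the rope_scaling contract documented in the class docstring above: the sketch below shows how this configuration might be instantiated with a 'llama3' scaling dict. It is not part of the diff; the import path and the concrete numeric values are illustrative assumptions.
```python
# Hypothetical example: build a LlamaConfig with 'llama3' rope scaling, using the
# rope_scaling keys documented in the docstring above. Import path and values are
# assumptions, not taken from this commit.
from ktransformers.models.configuration_llama import LlamaConfig

config = LlamaConfig(
    max_position_embeddings=131072,  # extended context length
    rope_theta=500000.0,             # RoPE base period
    rope_scaling={
        "rope_type": "llama3",
        "factor": 8.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_max_position_embeddings": 8192,
    },
)
print(config.rope_scaling["rope_type"])  # -> "llama3"
```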
This diff is collapsed.
'''
"""
Description :
Author : Boxin Zhang
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
"""
from torch import nn
from ktransformers.models.modeling_deepseek import DeepseekV2YarnRotaryEmbedding, DeepseekV2RotaryEmbedding
from transformers import ROPE_INIT_FUNCTIONS
from ktransformers.models.modeling_llama import (
LlamaRotaryEmbedding,
LlamaLinearScalingRotaryEmbedding,
LlamaDynamicNTKScalingRotaryEmbedding,
)
from ktransformers.models.modeling_deepseek import (
DeepseekV2YarnRotaryEmbedding,
DeepseekV2RotaryEmbedding,
)
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.utils import InferenceState
from transformers.configuration_utils import PretrainedConfig
# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2Moe
class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
self.orig_module.__init__(orig_module.dim,
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim, orig_module.max_position_embeddings, orig_module.base
)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self):
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.device,
)
class RotaryEmbeddingV2(BaseInjectedModule, LlamaRotaryEmbedding):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base)
orig_module.base,
None,
orig_module.scaling_factor,
orig_module.rope_type,
orig_module.config,
)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self):
self.orig_module.__init__(self.orig_module.dim,
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.device)
self.device,
self.orig_module.scaling_factor,
self.orig_module.rope_type,
self.orig_module.config,
)
class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
def __init__(self,
key: str,
gguf_loader : GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
self.orig_module.__init__(orig_module.dim,
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
# device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, generate_device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base,
None, #device
None, # device
orig_module.scaling_factor,
orig_module.original_max_position_embeddings,
orig_module.beta_fast,
orig_module.beta_slow,
orig_module.mscale,
orig_module.mscale_all_dim)
orig_module.mscale_all_dim,
)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self):
self.orig_module.__init__(self.orig_module.dim,
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.generate_device,
......@@ -70,5 +131,42 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
self.orig_module.beta_fast,
self.orig_module.beta_slow,
self.orig_module.mscale,
self.orig_module.mscale_all_dim)
self.orig_module.mscale_all_dim,
)
class DynamicNTKScalingRotaryEmbedding(
BaseInjectedModule, LlamaDynamicNTKScalingRotaryEmbedding
):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module,
device: str = "cuda",
**kwargs,
):
BaseInjectedModule.__init__(
self, key, gguf_loader, config, orig_module, device, **kwargs
)
self.orig_module.__init__(
orig_module.dim,
orig_module.max_position_embeddings,
orig_module.base,
None, # device
orig_module.scaling_factor,
orig_module.rope_type,
orig_module.config,
)
def load(self):
self.orig_module.__init__(
self.orig_module.dim,
self.orig_module.max_position_embeddings,
self.orig_module.base,
self.orig_module.device,
self.orig_module.scaling_factor,
self.orig_module.rope_type,
self.orig_module.config,
)