Unverified commit 233bbb8c authored by UnicornChan, committed by GitHub

Merge pull request #57 from UnicornChan/develop-0.1.3

[feature] release 0.1.3
parents 67f8b370 4d1d561d
@@ -7,16 +7,22 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 import torch
 from torch import nn
 import warnings
+import torch.nn.functional as F
+from ktransformers.operators.models import KLlamaModel
 from ktransformers.models.configuration_deepseek import DeepseekV2Config
+from ktransformers.models.configuration_llama import LlamaConfig
+from ktransformers.models.modeling_llama import LlamaRotaryEmbedding
 from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
 from typing import Optional, Tuple
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.util.custom_gguf import GGUFLoader
+import logging
 from transformers.configuration_utils import PretrainedConfig
 from transformers.cache_utils import Cache
+logger = logging.getLogger("attention")
 class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
+    attn_mask: Optional[torch.Tensor] = None
     def __init__(self,
                  key: str,
@@ -24,10 +30,12 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
                  config: PretrainedConfig,
                  orig_module: nn.Module,
                  device: str = "cuda",
+                 chunck_size: int = 1000,
                  **kwargs):
         BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
         self.orig_module.__init__(orig_module.config,
                                   orig_module.layer_idx)
+        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
     def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
         if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
@@ -157,9 +165,8 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
             )
         bsz, q_len, _ = hidden_states.size()
-        chunck_size = 256 # TODO, generate chunck_size automatically.
-        if q_len <= chunck_size:
+        if q_len <= self.chunck_size:
             return self.forward_chunck(
                 hidden_states,
                 attention_mask,
@@ -176,24 +183,170 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
         cur_idx = 0
         while cur_idx < q_len:
             if attention_mask is not None:
-                chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + chunck_size, q_len), ...]
+                chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + self.chunck_size, q_len), ...]
             else:
-                chunk_mask = None
+                # generate chunk_mask automatically.
+                self.attn_mask = \
+                    torch.zeros(1, 1, self.chunck_size, past_key_value.max_cache_len, device=hidden_states.device) \
+                    if self.attn_mask is None \
+                    else self.attn_mask
+                self.attn_mask[:, :, :, cur_idx:min(cur_idx+self.chunck_size, past_key_value.max_cache_len)] = \
+                    -1e+38 * torch.triu(torch.ones(self.chunck_size, self.chunck_size, device=hidden_states.device), diagonal=1)\
+                    [:,:min(self.chunck_size, min(past_key_value.max_cache_len-cur_idx, self.chunck_size))]
+                self.attn_mask[:, :, :, cur_idx+self.chunck_size:] = -1e+38
+                self.attn_mask[:, :, :, :cur_idx] = 0
+                chunck_mask = torch.narrow(self.attn_mask, 2, 0, min(self.chunck_size, q_len-cur_idx))
             cur_output, _, _ = self.forward_chunck(
-                hidden_states[:, cur_idx:min(cur_idx + chunck_size, q_len), ...],
-                chunk_mask,
-                position_ids[:, cur_idx:min(cur_idx + chunck_size, q_len)],
+                hidden_states[:, cur_idx:min(cur_idx + self.chunck_size, q_len), ...],
+                chunck_mask,
+                position_ids[:, cur_idx:min(cur_idx + self.chunck_size, q_len)],
                 past_key_value,
                 output_attentions,
                 use_cache,
-                cache_position[cur_idx:min(cur_idx + chunck_size, q_len)],
+                cache_position[cur_idx:min(cur_idx + self.chunck_size, q_len)],
                 **kwargs
             )
-            cur_idx += chunck_size
+            cur_idx += self.chunck_size
             if attn_output is None:
                 attn_output = cur_output
             else:
                 attn_output = torch.cat((attn_output, cur_output), dim=-2)
         return attn_output, None, past_key_value
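Note on the chunk mask built in the `else` branch above: each chunk of `chunck_size` query rows gets an additive mask over all cache columns in which previously cached positions are fully visible, the chunk itself is causal (lower-triangular), and everything past the chunk is blocked with a large negative value. The following is a minimal, self-contained sketch of that idea; `build_chunk_mask` and its arguments are illustrative names, not part of the ktransformers API, and it assumes the chunk fits inside the cache.

```python
import torch

def build_chunk_mask(chunk_start: int, chunk_len: int, max_cache_len: int,
                     device: str = "cpu", neg_inf: float = -1e38) -> torch.Tensor:
    """Additive attention mask for one prefill chunk (illustrative sketch).

    Rows are the chunk's query positions, columns are all cache slots:
    0 means "may attend", neg_inf means "masked out".
    """
    mask = torch.full((1, 1, chunk_len, max_cache_len), neg_inf, device=device)
    mask[..., :chunk_start] = 0.0  # tokens already in the cache are fully visible
    causal = torch.triu(torch.ones(chunk_len, chunk_len, device=device), diagonal=1)
    mask[..., chunk_start:chunk_start + chunk_len] = neg_inf * causal  # causal inside the chunk
    # columns after the chunk keep neg_inf: future positions stay hidden
    return mask

# e.g. the second 4-token chunk of a 16-slot cache
print(build_chunk_mask(chunk_start=4, chunk_len=4, max_cache_len=16).shape)  # (1, 1, 4, 16)
```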
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+class KLlamaAttention(BaseInjectedModule):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self,
+                 key: str,
+                 gguf_loader : GGUFLoader,
+                 config: PretrainedConfig,
+                 orig_module: nn.Module,
+                 device: str = "cuda",
+                 **kwargs):
+        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
+        self.orig_module.__init__(orig_module.config,
+                                  orig_module.layer_idx)
+    def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+        """Applies Rotary Position Embedding to the query and key tensors.
+        Args:
+            q (`torch.Tensor`): The query tensor.
+            k (`torch.Tensor`): The key tensor.
+            cos (`torch.Tensor`): The cosine part of the rotary embedding.
+            sin (`torch.Tensor`): The sine part of the rotary embedding.
+            position_ids (`torch.Tensor`, *optional*):
+                Deprecated and unused.
+            unsqueeze_dim (`int`, *optional*, defaults to 1):
+                The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+                sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+                that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+                k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+                cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+                the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+        Returns:
+            `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+        """
+        cos = cos.unsqueeze(unsqueeze_dim)
+        sin = sin.unsqueeze(unsqueeze_dim)
+        q_embed = (q * cos) + (rotate_half(q) * sin)
+        k_embed = (k * cos) + (rotate_half(k) * sin)
+        return q_embed, k_embed
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.45
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+        if self.config.pretraining_tp > 1:
+            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+            query_slices = self.q_proj.weight.split(
+                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+            )
+            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+            query_states = torch.cat(query_states, dim=-1)
+            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+            key_states = torch.cat(key_states, dim=-1)
+            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+            value_states = torch.cat(value_states, dim=-1)
+        else:
+            query_states = self.q_proj(hidden_states)
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        if position_embeddings is None:
+            logger.warning(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = self.apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if q_len == 1:
+            position_ids = position_ids[0][-1].unsqueeze(0).unsqueeze(0)
+            query_states = query_states[:, :, -1:]
+            key_states = key_states[:, :, -1:]
+        attn_output = KLlamaModel.dynamic_sdpa.apply(
+            self.layer_idx,
+            bsz,
+            position_ids[0][0],
+            query_states.transpose(1, 2).to(torch.float16),
+            key_states.transpose(1, 2).to(torch.float16),
+            value_states.transpose(1, 2).to(torch.float16),
+            mode="prefill" if q_len > 1 else "generate",
+        )
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, -1)
+        if self.config.pretraining_tp > 1:
+            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+        else:
+            attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
\ No newline at end of file
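To see the rotary helpers from this file in action, here is a small self-contained sketch of applying cos/sin tables to query/key tensors, mirroring `rotate_half` and the method-local `apply_rotary_pos_emb` above. The toy `build_cos_sin` helper and the chosen shapes are illustrative assumptions, not part of the repository.

```python
import torch

def rotate_half(x):
    """Rotate the last dimension: (x1, x2) -> (-x2, x1)."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def build_cos_sin(seq_len, head_dim, base=10000.0):
    """Toy cos/sin tables of shape [seq_len, head_dim] (illustrative only)."""
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    t = torch.arange(seq_len).float()
    freqs = torch.outer(t, inv_freq)            # [seq_len, head_dim/2]
    emb = torch.cat((freqs, freqs), dim=-1)     # [seq_len, head_dim]
    return emb.cos(), emb.sin()

bsz, heads, seq_len, head_dim = 1, 2, 5, 8
q = torch.randn(bsz, heads, seq_len, head_dim)
k = torch.randn(bsz, heads, seq_len, head_dim)
cos, sin = build_cos_sin(seq_len, head_dim)
# add batch and head axes so the tables broadcast over [bsz, heads, seq_len, head_dim]
cos, sin = cos[None, None], sin[None, None]
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
print(q_embed.shape, k_embed.shape)  # torch.Size([1, 2, 5, 8]) twice
```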
@@ -6,7 +6,7 @@ Author : Azure-Tang, Boxin Zhang, chenht2022
 Date : 2024-07-25 11:25:24
 Version : 0.1.0
 LastEditors : Azure
-LastEditTime : 2024-08-15 02:36:29
+LastEditTime : 2024-08-27 03:50:23
 Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 '''
@@ -436,7 +436,7 @@ class KExpertsTorch(KExpertsBase):
         final_hidden_states.index_add_(0, top_x, current_hidden_states)
-        return final_hidden_states.to(org_dtype, device=org_device)
+        return final_hidden_states.to(dtype=org_dtype, device=org_device)
 EXPERTS_MAP = {
     "KExpertsCPU": KExpertsCPU,
......
@@ -6,7 +6,7 @@
       generate_device: "cuda"
       prefill_device: "cuda"
 - match:
-    name: "^model\\.layers\\.(?!.*self_attn).*$" # regular expression
+    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
     class: torch.nn.Linear # only match modules matching name and class simultaneously
   replace:
     class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
...@@ -41,6 +41,12 @@ ...@@ -41,6 +41,12 @@
kwargs: kwargs:
generate_device: "cuda" generate_device: "cuda"
prefill_device: "cuda" prefill_device: "cuda"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.models.KDeepseekV2Model"
kwargs:
per_layer_prefill_intput_threshold: 2000 # 0 is close layer wise prefill
- match: - match:
name: "^model.embed_tokens" name: "^model.embed_tokens"
replace: replace:
......
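To make the rule format above concrete: each entry matches modules by a `name` regex (and optionally a `class`) and swaps in the replacement class with the given kwargs. The sketch below shows one plausible way such rules could be interpreted; the `inject()` helper, its first-match-wins behavior, and the constructor signature are illustrative assumptions, not the actual logic in ktransformers' optimize machinery.

```python
import re
import importlib
import yaml
import torch.nn as nn

def inject(model: nn.Module, rules_yaml: str) -> None:
    """Illustrative only: swap sub-modules whose qualified name matches a rule."""
    rules = yaml.safe_load(rules_yaml)
    for name, module in list(model.named_modules()):
        if not name:                      # skip the root module itself
            continue
        for rule in rules:
            match = rule.get("match", {})
            if "name" in match and re.search(match["name"], name) is None:
                continue
            if "class" in match and type(module).__name__ != match["class"].rsplit(".", 1)[-1]:
                continue
            target = rule["replace"]["class"]
            if target == "default":       # keep the original module as-is
                break
            mod_path, cls_name = target.rsplit(".", 1)
            new_cls = getattr(importlib.import_module(mod_path), cls_name)
            parent_name, _, child_name = name.rpartition(".")
            parent = model.get_submodule(parent_name) if parent_name else model
            # hypothetical constructor signature; the real operators take more arguments
            setattr(parent, child_name, new_cls(orig_module=module, **rule["replace"].get("kwargs", {})))
            break                         # first matching rule wins (assumed)
```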
+- match:
+    class: ktransformers.models.modeling_llama.LlamaRotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.RotaryEmbeddingV2
+- match:
+    name: "^model.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cpu"
+      prefill_device: "cpu"
+- match:
+    class: ktransformers.models.modeling_llama.LlamaModel
+  replace:
+    class: ktransformers.operators.models.KLlamaModel
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
+- match:
+    name: "^model\\.layers\\..*\\.self_attn$"
+  replace:
+    class: ktransformers.operators.attention.KLlamaAttention
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    name: "^model\\.layers\\..*\\."
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
 - match:
     class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
   replace:
......
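The `per_layer_prefill_intput_threshold` knob appears in both rule files above: judging only by the inline comment, 0 turns layer-wise prefill off, while a positive value (2000 in the DeepSeek rules) presumably enables it once the prompt exceeds that many tokens. A hedged sketch of that assumed interpretation:

```python
def use_layer_wise_prefill(q_len: int, threshold: int) -> bool:
    """Assumed semantics: 0 disables layer-wise prefill; otherwise prefill layer by
    layer once the input is longer than `threshold` tokens."""
    return threshold > 0 and q_len > threshold

assert use_layer_wise_prefill(q_len=5000, threshold=2000) is True
assert use_layer_wise_prefill(q_len=5000, threshold=0) is False
```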
@@ -5,10 +5,11 @@ Description :
 Author : unicornchan
 Date : 2024-06-11 16:35:42
 Version : 1.0.0
-LastEditors : chenxl
-LastEditTime : 2024-07-27 01:55:42
+LastEditors : WuHao
+LastEditTime : 2024-08-12 06:31:14
 '''
 import os
+import shutil
 import yaml
 from ktransformers.server.config.singleton import Singleton
@@ -30,10 +31,18 @@ class Config(metaclass=Singleton):
             os.path.dirname(os.path.dirname(__file__)))
         config_yaml: str = os.path.join(
             base_path, "configs", Config.CONFIG_FILE_NAME)
+        user_path: str = os.path.expanduser('~')
+        localstore_path: str = os.path.join(user_path,'.ktransformers')
+        config_path: str = os.path.join(localstore_path,Config.CONFIG_FILE_NAME)
         if not os.path.exists(config_yaml):
             print(f"Can't find config file, {config_yaml}")
             exit(-1)
-        with open(config_yaml, 'r', encoding="utf-8") as fp:
+        if not os.path.exists(localstore_path):
+            os.mkdir(localstore_path)
+        if not os.path.exists(config_path):
+            shutil.copyfile(config_yaml,config_path)
+        with open(config_path, 'r', encoding="utf-8") as fp:
             config = yaml.safe_load(fp)
         return config
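The net effect of the `Config.load()` change above: on first run the packaged `configs/config.yaml` is copied to `~/.ktransformers/config.yaml`, and later runs read the per-user copy instead of the one shipped with the package. A minimal usage sketch under that reading (the import path is an assumption based on the surrounding imports):

```python
import os
from ktransformers.server.config.config import Config  # assumed module path

cfg = Config()  # singleton; the first call copies the default YAML into ~/.ktransformers/
print(cfg.localstore_path)  # e.g. /home/<user>/.ktransformers
print(os.path.exists(os.path.join(cfg.localstore_path, "config.yaml")))  # True after first run
```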
@@ -51,6 +60,8 @@ class Config(metaclass=Singleton):
         cfg = Config.load()
         self.base_path = os.path.dirname(
             os.path.dirname(os.path.dirname(__file__)))
+        self.user_path: str = os.path.expanduser('~')
+        self.localstore_path: str = os.path.join(self.user_path,'.ktransformers')
         # log configs
         self.log_dir = os.path.join(self.base_path, Config.to_path(cfg["log"]["dir"]))
         self.log_file = cfg["log"]["file"]
@@ -83,6 +94,7 @@ class Config(metaclass=Singleton):
         self.model_name: str = self.model.get("name", "")
         self.model_device: str = self.model.get("device", "cuda:0")
         self.gguf_path: str = self.model.get("gguf_path", "")
+        self.model_cache_lens = self.model.get("cache_lens")
         # web config
         self.web: dict = cfg.get("web", {})
@@ -91,3 +103,11 @@
         self.ext: dict = cfg.get("ext", {})
         self.cpu_infer = self.ext.get("cpu_infer", 10)
+        #file config
+        self.local_store_configs: dict = cfg.get("local_store",{})
+        self.file_upload_dir: str = os.path.join(self.localstore_path,self.local_store_configs.get("file_upload_dir",""))
+        self.assistant_store_dir: str = os.path.join(self.localstore_path,self.local_store_configs.get("assistant_store_dir",""))
+        #long context config
+        self.long_context_config: dict = cfg.get("long_context",{})
\ No newline at end of file
@@ -46,6 +46,7 @@ class CUDAGraphRunner:
         capture_stream.wait_stream(torch.cuda.current_stream())
         torch.cuda.set_device(main_device)
         torch.cuda.set_stream(capture_stream)
-        past_key_values.change_seq_length(-1)
+        if past_key_values != None:
+            past_key_values.change_seq_length(-1)
         torch.cuda.synchronize(self.main_device)
         #self.graph.debug_dump("cuda_graph_hooked.dot")
......
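For context on what CUDAGraphRunner is guarding here: graph capture records a fixed kernel sequence during a warm-up pass, so any KV-cache bookkeeping done while capturing has to be rolled back (hence `change_seq_length(-1)`) and must be skipped when no cache is present. Below is a minimal, generic capture/replay sketch using stock PyTorch CUDA-graph APIs, unrelated to ktransformers' internals and assuming a CUDA device is available.

```python
import torch

model = torch.nn.Linear(16, 16).cuda()
static_in = torch.zeros(1, 16, device="cuda")
static_out = torch.zeros(1, 16, device="cuda")

# warm up on a side stream before capture
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    static_out.copy_(model(static_in))
torch.cuda.current_stream().wait_stream(s)

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):      # capture one forward pass into the graph
    static_out.copy_(model(static_in))

static_in.copy_(torch.randn(1, 16, device="cuda"))
graph.replay()                     # re-runs the captured kernels on the new input
print(static_out)
```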
@@ -6,7 +6,7 @@ Author : Azure-Tang, Boxin Zhang, chenht2022
 Date : 2024-07-26 08:48:54
 Version : 1.0.0
 LastEditors : kkk1nak0
-LastEditTime : 2024-08-12 07:21:55
+LastEditTime : 2024-08-14 08:20:45
 Adapted from https://github.com/99991/pygguf/blob/main/gguf.py
 Copyright (c) 2023-2024 The ggml authors
 Copyright (c) 2024 Thomas Germer
@@ -294,7 +294,6 @@ class GGUFLoader:
         else:
             values = GGML_DEQUANTIZE[ggml_name](data)
             values = torch.from_numpy(values)
         values = values.view(shape[::-1])
         if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
             n_head = self.gguf_file_meta['llama.attention.head_count']
......
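The `attn_q` special case visible above exists because GGUF files exported from llama-style checkpoints store the Q/K projection weights in the interleaved per-head layout used by llama.cpp, which a loader has to undo before handing the tensors to a Hugging Face-style model. The sketch below shows the standard llama.cpp-style interleave and its inverse as an illustration; the exact reshape used in custom_gguf.py may differ, so treat the parameters as assumptions.

```python
import numpy as np

def permute(w: np.ndarray, n_head: int) -> np.ndarray:
    """llama.cpp-style interleave of per-head rows (as applied when exporting GGUF)."""
    out_dim = w.shape[0]
    return (w.reshape(n_head, 2, out_dim // n_head // 2, *w.shape[1:])
             .swapaxes(1, 2)
             .reshape(w.shape))

def unpermute(w: np.ndarray, n_head: int) -> np.ndarray:
    """Inverse transform: recover the original (HF-style) row order."""
    out_dim = w.shape[0]
    return (w.reshape(n_head, out_dim // n_head // 2, 2, *w.shape[1:])
             .swapaxes(1, 2)
             .reshape(w.shape))

w = np.arange(16 * 4, dtype=np.float32).reshape(16, 4)  # toy weight: 2 heads, head_dim 8
assert np.array_equal(unpermute(permute(w, n_head=2), n_head=2), w)  # round-trip check
```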