Unverified commit 233bbb8c authored by UnicornChan, committed by GitHub

Merge pull request #57 from UnicornChan/develop-0.1.3

[feature] release 0.1.3
parents 67f8b370 4d1d561d
@@ -7,16 +7,22 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 import torch
 from torch import nn
 import warnings
+import torch.nn.functional as F
+from ktransformers.operators.models import KLlamaModel
 from ktransformers.models.configuration_deepseek import DeepseekV2Config
+from ktransformers.models.configuration_llama import LlamaConfig
+from ktransformers.models.modeling_llama import LlamaRotaryEmbedding
 from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
 from typing import Optional, Tuple
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.util.custom_gguf import GGUFLoader
+import logging
 from transformers.configuration_utils import PretrainedConfig
 from transformers.cache_utils import Cache
+logger = logging.getLogger("attention")
 class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
+    attn_mask: Optional[torch.Tensor] = None
     def __init__(self,
                  key: str,
@@ -24,10 +30,12 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
                  config: PretrainedConfig,
                  orig_module: nn.Module,
                  device: str = "cuda",
+                 chunck_size: int = 1000,
                  **kwargs):
         BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
         self.orig_module.__init__(orig_module.config,
                                   orig_module.layer_idx)
+        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
     def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
         if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
@@ -157,9 +165,8 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
             )
         bsz, q_len, _ = hidden_states.size()
-        chunck_size = 256 # TODO, generate chunck_size automatically.
-        if q_len <= chunck_size:
+        if q_len <= self.chunck_size:
             return self.forward_chunck(
                 hidden_states,
                 attention_mask,
@@ -176,24 +183,170 @@ class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
         cur_idx = 0
         while cur_idx < q_len:
             if attention_mask is not None:
-                chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + chunck_size, q_len), ...]
+                chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + self.chunck_size, q_len), ...]
             else:
-                chunk_mask = None
+                # generate chunk_mask automatically.
+                self.attn_mask = \
+                    torch.zeros(1, 1, self.chunck_size, past_key_value.max_cache_len, device=hidden_states.device) \
+                    if self.attn_mask is None \
+                    else self.attn_mask
+                self.attn_mask[:, :, :, cur_idx:min(cur_idx+self.chunck_size, past_key_value.max_cache_len)] = \
+                    -1e+38 * torch.triu(torch.ones(self.chunck_size, self.chunck_size, device=hidden_states.device), diagonal=1)\
+                    [:,:min(self.chunck_size, min(past_key_value.max_cache_len-cur_idx, self.chunck_size))]
+                self.attn_mask[:, :, :, cur_idx+self.chunck_size:] = -1e+38
+                self.attn_mask[:, :, :, :cur_idx] = 0
+                chunck_mask = torch.narrow(self.attn_mask, 2, 0, min(self.chunck_size, q_len-cur_idx))
             cur_output, _, _ = self.forward_chunck(
-                hidden_states[:, cur_idx:min(cur_idx + chunck_size, q_len), ...],
-                chunk_mask,
-                position_ids[:, cur_idx:min(cur_idx + chunck_size, q_len)],
+                hidden_states[:, cur_idx:min(cur_idx + self.chunck_size, q_len), ...],
+                chunck_mask,
+                position_ids[:, cur_idx:min(cur_idx + self.chunck_size, q_len)],
                 past_key_value,
                 output_attentions,
                 use_cache,
-                cache_position[cur_idx:min(cur_idx + chunck_size, q_len)],
+                cache_position[cur_idx:min(cur_idx + self.chunck_size, q_len)],
                 **kwargs
             )
-            cur_idx += chunck_size
+            cur_idx += self.chunck_size
             if attn_output is None:
                 attn_output = cur_output
             else:
                 attn_output = torch.cat((attn_output, cur_output), dim=-2)
         return attn_output, None, past_key_value
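Note on the chunk mask built in the `else` branch above: each chunk of `chunck_size` query rows gets an additive mask over all cache columns in which previously cached positions are fully visible, the chunk itself is causal (lower-triangular), and everything past the chunk is blocked with a large negative value. The following is a minimal, self-contained sketch of that idea; `build_chunk_mask` and its arguments are illustrative names, not part of the ktransformers API, and it assumes the chunk fits inside the cache.

```python
import torch

def build_chunk_mask(chunk_start: int, chunk_len: int, max_cache_len: int,
                     device: str = "cpu", neg_inf: float = -1e38) -> torch.Tensor:
    """Additive attention mask for one prefill chunk (illustrative sketch).

    Rows are the chunk's query positions, columns are all cache slots:
    0 means "may attend", neg_inf means "masked out".
    """
    mask = torch.full((1, 1, chunk_len, max_cache_len), neg_inf, device=device)
    mask[..., :chunk_start] = 0.0  # tokens already in the cache are fully visible
    causal = torch.triu(torch.ones(chunk_len, chunk_len, device=device), diagonal=1)
    mask[..., chunk_start:chunk_start + chunk_len] = neg_inf * causal  # causal inside the chunk
    # columns after the chunk keep neg_inf: future positions stay hidden
    return mask

# e.g. the second 4-token chunk of a 16-slot cache
print(build_chunk_mask(chunk_start=4, chunk_len=4, max_cache_len=16).shape)  # (1, 1, 4, 16)
```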
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+class KLlamaAttention(BaseInjectedModule):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self,
+                 key: str,
+                 gguf_loader : GGUFLoader,
+                 config: PretrainedConfig,
+                 orig_module: nn.Module,
+                 device: str = "cuda",
+                 **kwargs):
+        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
+        self.orig_module.__init__(orig_module.config,
+                                  orig_module.layer_idx)
+    def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+        """Applies Rotary Position Embedding to the query and key tensors.
+        Args:
+            q (`torch.Tensor`): The query tensor.
+            k (`torch.Tensor`): The key tensor.
+            cos (`torch.Tensor`): The cosine part of the rotary embedding.
+            sin (`torch.Tensor`): The sine part of the rotary embedding.
+            position_ids (`torch.Tensor`, *optional*):
+                Deprecated and unused.
+            unsqueeze_dim (`int`, *optional*, defaults to 1):
+                The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+                sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+                that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+                k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+                cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+                the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+        Returns:
+            `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+        """
+        cos = cos.unsqueeze(unsqueeze_dim)
+        sin = sin.unsqueeze(unsqueeze_dim)
+        q_embed = (q * cos) + (rotate_half(q) * sin)
+        k_embed = (k * cos) + (rotate_half(k) * sin)
+        return q_embed, k_embed
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.45
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+        if self.config.pretraining_tp > 1:
+            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+            query_slices = self.q_proj.weight.split(
+                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+            )
+            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+            query_states = torch.cat(query_states, dim=-1)
+            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+            key_states = torch.cat(key_states, dim=-1)
+            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+            value_states = torch.cat(value_states, dim=-1)
+        else:
+            query_states = self.q_proj(hidden_states)
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        if position_embeddings is None:
+            logger.warning(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = self.apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if q_len == 1:
+            position_ids = position_ids[0][-1].unsqueeze(0).unsqueeze(0)
+            query_states = query_states[:, :, -1:]
+            key_states = key_states[:, :, -1:]
+        attn_output = KLlamaModel.dynamic_sdpa.apply(
+            self.layer_idx,
+            bsz,
+            position_ids[0][0],
+            query_states.transpose(1, 2).to(torch.float16),
+            key_states.transpose(1, 2).to(torch.float16),
+            value_states.transpose(1, 2).to(torch.float16),
+            mode="prefill" if q_len > 1 else "generate",
+        )
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, -1)
+        if self.config.pretraining_tp > 1:
+            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+        else:
+            attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
\ No newline at end of file
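To see the rotary helpers from this file in action, here is a small self-contained sketch of applying cos/sin tables to query/key tensors, mirroring `rotate_half` and the method-local `apply_rotary_pos_emb` above. The toy `build_cos_sin` helper and the chosen shapes are illustrative assumptions, not part of the repository.

```python
import torch

def rotate_half(x):
    """Rotate the last dimension: (x1, x2) -> (-x2, x1)."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def build_cos_sin(seq_len, head_dim, base=10000.0):
    """Toy cos/sin tables of shape [seq_len, head_dim] (illustrative only)."""
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    t = torch.arange(seq_len).float()
    freqs = torch.outer(t, inv_freq)            # [seq_len, head_dim/2]
    emb = torch.cat((freqs, freqs), dim=-1)     # [seq_len, head_dim]
    return emb.cos(), emb.sin()

bsz, heads, seq_len, head_dim = 1, 2, 5, 8
q = torch.randn(bsz, heads, seq_len, head_dim)
k = torch.randn(bsz, heads, seq_len, head_dim)
cos, sin = build_cos_sin(seq_len, head_dim)
# add batch and head axes so the tables broadcast over [bsz, heads, seq_len, head_dim]
cos, sin = cos[None, None], sin[None, None]
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
print(q_embed.shape, k_embed.shape)  # torch.Size([1, 2, 5, 8]) twice
```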
@@ -6,7 +6,7 @@ Author : Azure-Tang, Boxin Zhang, chenht2022
 Date : 2024-07-25 11:25:24
 Version : 0.1.0
 LastEditors : Azure
-LastEditTime : 2024-08-15 02:36:29
+LastEditTime : 2024-08-27 03:50:23
 Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 '''
@@ -436,7 +436,7 @@ class KExpertsTorch(KExpertsBase):
         final_hidden_states.index_add_(0, top_x, current_hidden_states)
-        return final_hidden_states.to(org_dtype, device=org_device)
+        return final_hidden_states.to(dtype=org_dtype, device=org_device)
 EXPERTS_MAP = {
     "KExpertsCPU": KExpertsCPU,
......
@@ -6,7 +6,7 @@
       generate_device: "cuda"
       prefill_device: "cuda"
 - match:
-    name: "^model\\.layers\\.(?!.*self_attn).*$" # regular expression
+    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
     class: torch.nn.Linear # only match modules matching name and class simultaneously
   replace:
     class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
...@@ -41,6 +41,12 @@ ...@@ -41,6 +41,12 @@
kwargs: kwargs:
generate_device: "cuda" generate_device: "cuda"
prefill_device: "cuda" prefill_device: "cuda"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.models.KDeepseekV2Model"
kwargs:
per_layer_prefill_intput_threshold: 2000 # 0 is close layer wise prefill
- match: - match:
name: "^model.embed_tokens" name: "^model.embed_tokens"
replace: replace:
......
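To make the rule format above concrete: each entry matches modules by a `name` regex (and optionally a `class`) and swaps in the replacement class with the given kwargs. The sketch below shows one plausible way such rules could be interpreted; the `inject()` helper, its first-match-wins behavior, and the constructor signature are illustrative assumptions, not the actual logic in ktransformers' optimize machinery.

```python
import re
import importlib
import yaml
import torch.nn as nn

def inject(model: nn.Module, rules_yaml: str) -> None:
    """Illustrative only: swap sub-modules whose qualified name matches a rule."""
    rules = yaml.safe_load(rules_yaml)
    for name, module in list(model.named_modules()):
        if not name:                      # skip the root module itself
            continue
        for rule in rules:
            match = rule.get("match", {})
            if "name" in match and re.search(match["name"], name) is None:
                continue
            if "class" in match and type(module).__name__ != match["class"].rsplit(".", 1)[-1]:
                continue
            target = rule["replace"]["class"]
            if target == "default":       # keep the original module as-is
                break
            mod_path, cls_name = target.rsplit(".", 1)
            new_cls = getattr(importlib.import_module(mod_path), cls_name)
            parent_name, _, child_name = name.rpartition(".")
            parent = model.get_submodule(parent_name) if parent_name else model
            # hypothetical constructor signature; the real operators take more arguments
            setattr(parent, child_name, new_cls(orig_module=module, **rule["replace"].get("kwargs", {})))
            break                         # first matching rule wins (assumed)
```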
+- match:
+    class: ktransformers.models.modeling_llama.LlamaRotaryEmbedding
+  replace:
+    class: ktransformers.operators.RoPE.RotaryEmbeddingV2
+- match:
+    name: "^model.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cpu"
+      prefill_device: "cpu"
+- match:
+    class: ktransformers.models.modeling_llama.LlamaModel
+  replace:
+    class: ktransformers.operators.models.KLlamaModel
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
+- match:
+    name: "^model\\.layers\\..*\\.self_attn$"
+  replace:
+    class: ktransformers.operators.attention.KLlamaAttention
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+- match:
+    name: "^model\\.layers\\..*\\."
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
 - match:
     class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
   replace:
......
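The `per_layer_prefill_intput_threshold` knob appears in both rule files above: judging only by the inline comment, 0 turns layer-wise prefill off, while a positive value (2000 in the DeepSeek rules) presumably enables it once the prompt exceeds that many tokens. A hedged sketch of that assumed interpretation:

```python
def use_layer_wise_prefill(q_len: int, threshold: int) -> bool:
    """Assumed semantics: 0 disables layer-wise prefill; otherwise prefill layer by
    layer once the input is longer than `threshold` tokens."""
    return threshold > 0 and q_len > threshold

assert use_layer_wise_prefill(q_len=5000, threshold=2000) is True
assert use_layer_wise_prefill(q_len=5000, threshold=0) is False
```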
@@ -5,10 +5,11 @@ Description :
 Author : unicornchan
 Date : 2024-06-11 16:35:42
 Version : 1.0.0
-LastEditors : chenxl
-LastEditTime : 2024-07-27 01:55:42
+LastEditors : WuHao
+LastEditTime : 2024-08-12 06:31:14
 '''
 import os
+import shutil
 import yaml
 from ktransformers.server.config.singleton import Singleton
@@ -30,10 +31,18 @@ class Config(metaclass=Singleton):
             os.path.dirname(os.path.dirname(__file__)))
         config_yaml: str = os.path.join(
             base_path, "configs", Config.CONFIG_FILE_NAME)
+        user_path: str = os.path.expanduser('~')
+        localstore_path: str = os.path.join(user_path,'.ktransformers')
+        config_path: str = os.path.join(localstore_path,Config.CONFIG_FILE_NAME)
         if not os.path.exists(config_yaml):
             print(f"Can't find config file, {config_yaml}")
             exit(-1)
-        with open(config_yaml, 'r', encoding="utf-8") as fp:
+        if not os.path.exists(localstore_path):
+            os.mkdir(localstore_path)
+        if not os.path.exists(config_path):
+            shutil.copyfile(config_yaml,config_path)
+        with open(config_path, 'r', encoding="utf-8") as fp:
             config = yaml.safe_load(fp)
         return config
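The net effect of the `Config.load()` change above: on first run the packaged `configs/config.yaml` is copied to `~/.ktransformers/config.yaml`, and later runs read the per-user copy instead of the one shipped with the package. A minimal usage sketch under that reading (the import path is an assumption based on the surrounding imports):

```python
import os
from ktransformers.server.config.config import Config  # assumed module path

cfg = Config()  # singleton; the first call copies the default YAML into ~/.ktransformers/
print(cfg.localstore_path)  # e.g. /home/<user>/.ktransformers
print(os.path.exists(os.path.join(cfg.localstore_path, "config.yaml")))  # True after first run
```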
@@ -51,6 +60,8 @@ class Config(metaclass=Singleton):
         cfg = Config.load()
         self.base_path = os.path.dirname(
             os.path.dirname(os.path.dirname(__file__)))
+        self.user_path: str = os.path.expanduser('~')
+        self.localstore_path: str = os.path.join(self.user_path,'.ktransformers')
         # log configs
         self.log_dir = os.path.join(self.base_path, Config.to_path(cfg["log"]["dir"]))
         self.log_file = cfg["log"]["file"]
@@ -83,6 +94,7 @@ class Config(metaclass=Singleton):
         self.model_name: str = self.model.get("name", "")
         self.model_device: str = self.model.get("device", "cuda:0")
         self.gguf_path: str = self.model.get("gguf_path", "")
+        self.model_cache_lens = self.model.get("cache_lens")
         # web config
         self.web: dict = cfg.get("web", {})
@@ -91,3 +103,11 @@
         self.ext: dict = cfg.get("ext", {})
         self.cpu_infer = self.ext.get("cpu_infer", 10)
+        #file config
+        self.local_store_configs: dict = cfg.get("local_store",{})
+        self.file_upload_dir: str = os.path.join(self.localstore_path,self.local_store_configs.get("file_upload_dir",""))
+        self.assistant_store_dir: str = os.path.join(self.localstore_path,self.local_store_configs.get("assistant_store_dir",""))
+        #long context config
+        self.long_context_config: dict = cfg.get("long_context",{})
\ No newline at end of file
@@ -46,6 +46,7 @@ class CUDAGraphRunner:
         capture_stream.wait_stream(torch.cuda.current_stream())
         torch.cuda.set_device(main_device)
         torch.cuda.set_stream(capture_stream)
-        past_key_values.change_seq_length(-1)
+        if past_key_values != None:
+            past_key_values.change_seq_length(-1)
         torch.cuda.synchronize(self.main_device)
         #self.graph.debug_dump("cuda_graph_hooked.dot")
......
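For context on what CUDAGraphRunner is guarding here: graph capture records a fixed kernel sequence during a warm-up pass, so any KV-cache bookkeeping done while capturing has to be rolled back (hence `change_seq_length(-1)`) and must be skipped when no cache is present. Below is a minimal, generic capture/replay sketch using stock PyTorch CUDA-graph APIs, unrelated to ktransformers' internals and assuming a CUDA device is available.

```python
import torch

model = torch.nn.Linear(16, 16).cuda()
static_in = torch.zeros(1, 16, device="cuda")
static_out = torch.zeros(1, 16, device="cuda")

# warm up on a side stream before capture
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    static_out.copy_(model(static_in))
torch.cuda.current_stream().wait_stream(s)

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):      # capture one forward pass into the graph
    static_out.copy_(model(static_in))

static_in.copy_(torch.randn(1, 16, device="cuda"))
graph.replay()                     # re-runs the captured kernels on the new input
print(static_out)
```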
@@ -6,7 +6,7 @@ Author : Azure-Tang, Boxin Zhang, chenht2022
 Date : 2024-07-26 08:48:54
 Version : 1.0.0
 LastEditors : kkk1nak0
-LastEditTime : 2024-08-12 07:21:55
+LastEditTime : 2024-08-14 08:20:45
 Adapted from https://github.com/99991/pygguf/blob/main/gguf.py
 Copyright (c) 2023-2024 The ggml authors
 Copyright (c) 2024 Thomas Germer
@@ -294,7 +294,6 @@ class GGUFLoader:
         else:
             values = GGML_DEQUANTIZE[ggml_name](data)
             values = torch.from_numpy(values)
         values = values.view(shape[::-1])
         if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
             n_head = self.gguf_file_meta['llama.attention.head_count']
......
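The `attn_q` special case visible above exists because GGUF files exported from llama-style checkpoints store the Q/K projection weights in the interleaved per-head layout used by llama.cpp, which a loader has to undo before handing the tensors to a Hugging Face-style model. The sketch below shows the standard llama.cpp-style interleave and its inverse as an illustration; the exact reshape used in custom_gguf.py may differ, so treat the parameters as assumptions.

```python
import numpy as np

def permute(w: np.ndarray, n_head: int) -> np.ndarray:
    """llama.cpp-style interleave of per-head rows (as applied when exporting GGUF)."""
    out_dim = w.shape[0]
    return (w.reshape(n_head, 2, out_dim // n_head // 2, *w.shape[1:])
             .swapaxes(1, 2)
             .reshape(w.shape))

def unpermute(w: np.ndarray, n_head: int) -> np.ndarray:
    """Inverse transform: recover the original (HF-style) row order."""
    out_dim = w.shape[0]
    return (w.reshape(n_head, out_dim // n_head // 2, 2, *w.shape[1:])
             .swapaxes(1, 2)
             .reshape(w.shape))

w = np.arange(16 * 4, dtype=np.float32).reshape(16, 4)  # toy weight: 2 heads, head_dim 8
assert np.array_equal(unpermute(permute(w, n_head=2), n_head=2), w)  # round-trip check
```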