"vscode:/vscode.git/clone" did not exist on "49b23e1583346029bdd28e67b2fd146c9569a789"
Commit c6aa379d authored by qiyuxinlin

support safetensor load, delete architectures argument

parent 900a7f7c
 import os
 import sys
 sys.path.insert(0,"/home/zbx/ktransformers")
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 import torch
 gguf_loader_1 = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
......
#!/bin/bash
set -e
# clear build dirs
# rm -rf build
# rm -rf *.egg-info
# rm -rf csrc/build
# rm -rf csrc/ktransformers_ext/build
# rm -rf csrc/ktransformers_ext/cuda/build
# rm -rf csrc/ktransformers_ext/cuda/dist
# rm -rf csrc/ktransformers_ext/cuda/*.egg-info
rm -rf ~/.ktransformers
echo "Installing python dependencies from requirements.txt"
pip install -r requirements-local_chat.txt
pip install -r ktransformers/server/requirements.txt
echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE USE_BALANCE_SERVE=1 pip install -v . --no-build-isolation
pip install third_party/custom_flashinfer/ -v
# SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
# echo "Copying thirdparty libs to $SITE_PACKAGES"
# cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/
# patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython*
echo "Installation completed successfully"
@@ -66,7 +66,7 @@ class StaticCache(transformers.StaticCache):
         self.page_table_list = []
         for idx in range(config.num_hidden_layers):
             if isinstance(device, dict):
-                target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
+                target_device = device[f"model.layers.{idx}.self_attn"]["generate_device"]
             else:
                 target_device = device
@@ -91,7 +91,7 @@ class StaticCache(transformers.StaticCache):
         # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
         # breaks when updating the cache.
         if isinstance(device, dict):
-            target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
+            target_device = device[f"model.layers.{idx}.self_attn"]["generate_device"]
         else:
             target_device = device
......
@@ -39,7 +39,7 @@ class KQwen2MoeForCausalLM(Qwen2MoePreTrainedModel):
         self.cache = cache
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        self.attn = [None] * 10
+        self.attn = [None] * 100
     def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
         self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)
......
@@ -39,7 +39,7 @@ class KQwen3MoeForCausalLM(Qwen3MoePreTrainedModel):
         self.cache = cache
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        self.attn = [None] * 10
+        self.attn = [None] * 100
     def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
         self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)
......
@@ -23,7 +23,7 @@ from ktransformers.models.modeling_deepseek import (
     yarn_find_correction_range
 )
 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from ktransformers.util.utils import InferenceState
 from transformers.configuration_utils import PretrainedConfig
 import torch
......
@@ -15,7 +15,7 @@ from ktransformers.models.modeling_llama import LlamaRotaryEmbedding
 from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
 from typing import Optional, Tuple
 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from ktransformers.util.utils import get_compute_capability
 import logging
 from transformers.configuration_utils import PretrainedConfig
......
@@ -11,7 +11,7 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeAttention
 from ktransformers.models.modeling_qwen3_moe import Qwen3MoeAttention
 from typing import Optional, Tuple
 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 import logging
 from transformers.configuration_utils import PretrainedConfig
 from flashinfer import BatchMLAPagedAttentionWrapper
......
@@ -6,7 +6,7 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 '''
 from typing import Any
 from torch import nn, Tensor
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from transformers.configuration_utils import PretrainedConfig
 import ktransformers.util.utils as utils
 class BaseInjectedModule(nn.Module):
......
@@ -26,7 +26,8 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext
 import cpuinfer_ext
 from cpuinfer_ext.moe import MOEConfig, MOE
 import ctypes
-from ktransformers.util.custom_gguf import GGMLQuantizationType, GGUFLoader
+from ktransformers.util.custom_gguf import GGMLQuantizationType
+from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader, ModelLoader
 from ktransformers.util.utils import InferenceState
 from ktransformers.server.config.config import Config
 from transformers.activations import ACT2FN
@@ -39,8 +40,18 @@ from ktransformers.operators.cpuinfer import CPUInfer
 def deduplicate_and_sort(lst):
     return sorted(set(lst))
+def generate_cuda_graphs(chunk_size: int) -> list:
+    assert chunk_size <= 1024 or chunk_size % 1024 == 0, "chunk_size must <= 1024 or a multiple of 1024"
+    base_list = [1, 2, 3, Config().max_batch_size, 64, 256, 512, chunk_size]
+    if chunk_size <= 1024:
+        return base_list
+    multiples = [i for i in range(1024, chunk_size + 1, 1024)]
+    return deduplicate_and_sort(base_list + multiples)
 #cuda_graphs = [Config().chunk_size]
-cuda_graphs = deduplicate_and_sort([1, 2, 3, Config().max_batch_size, 64, Config().chunk_size])
+cuda_graphs = generate_cuda_graphs(Config().chunk_size)
 # class Base(BaseInjectedModule, ABC):
 class KExpertsBase(ABC):
     def __init__(self, key: str, gguf_loader: GGUFLoader, config: PretrainedConfig, orig_module: nn.Module, device: str = "cuda", **kwargs):
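
For reference, a quick sketch of the values the new generate_cuda_graphs helper produces. The Config().max_batch_size value is taken as 4 purely for illustration (the real number comes from the server config):

# Illustrative stand-in for generate_cuda_graphs, with max_batch_size assumed to be 4.
def generate_cuda_graphs_demo(chunk_size: int, max_batch_size: int = 4) -> list:
    assert chunk_size <= 1024 or chunk_size % 1024 == 0
    base_list = [1, 2, 3, max_batch_size, 64, 256, 512, chunk_size]
    if chunk_size <= 1024:
        return base_list                 # returned as-is, without dedup/sort
    multiples = list(range(1024, chunk_size + 1, 1024))
    return sorted(set(base_list + multiples))

print(generate_cuda_graphs_demo(512))   # [1, 2, 3, 4, 64, 256, 512, 512]
print(generate_cuda_graphs_demo(4096))  # [1, 2, 3, 4, 64, 256, 512, 1024, 2048, 3072, 4096]

The captured graph sizes now follow chunk_size in 1024-token steps, which is also why the KExpertsCPU MOEConfig calls below pass max(cuda_graphs) instead of the hard-coded 25600.
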
@@ -77,7 +88,7 @@ class KExpertsBase(ABC):
         down_type = None
         for key in keys:
-            if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
+            if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
                 targets = [".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight" ]
                 tensors = self.load_multi(key, targets, device=device)
                 gate = tensors[".ffn_gate_exps.weight"]
@@ -86,7 +97,7 @@ class KExpertsBase(ABC):
                 gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
                 up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
                 down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
-            elif key + ".ffn_down.0.weight" in self.gguf_loader.tensor_info:
+            elif self.gguf_loader.has_tensor(key + ".ffn_down.0.weight"):
                 # for supporting Mixtral-8x7B-Instuct
                 gate = []
                 up = []
@@ -194,7 +205,7 @@ class KExpertsCPU(KExpertsBase):
             self.config.num_experts_per_tok,
             self.config.hidden_size,
             self.config.moe_intermediate_size,
-            25600,
+            max(cuda_graphs),
             gate_ptr,
             up_ptr,
             down_ptr,
@@ -212,7 +223,7 @@ class KExpertsCPU(KExpertsBase):
             self.config.num_experts_per_tok,
             self.config.hidden_size,
             self.config.moe_intermediate_size,
-            25600,
+            max(cuda_graphs),
             gate_ptr,
             up_ptr,
             down_ptr,
@@ -325,14 +336,19 @@ class KExpertsCPU(KExpertsBase):
         down_type = None
         for key in keys:
-            if self.gguf_loader.safetensor_loader is not None:
-                # using a temp ugly way to temprary load the tensor
-                gate = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.weight").numpy()
-                up = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.weight").numpy()
-                down = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.weight").numpy()
-                gate_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.ggml_type").item()
-                up_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.ggml_type").item()
-                down_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.ggml_type").item()
+            if isinstance(self.gguf_loader, SafeTensorLoader):
+                res = self.gguf_loader.load_experts(key)
+                return {key: res}
+            elif self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
+                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
+                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
+                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
+                # gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
+                # up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
+                # down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
+                gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate_exps.weight")
+                up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up_exps.weight")
+                down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down_exps.weight")
             elif key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
                 gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
@@ -356,9 +372,9 @@ class KExpertsCPU(KExpertsBase):
                 gate = np.stack(gate)
                 up = np.stack(up)
                 down = np.stack(down)
-                gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate.0.weight"]["ggml_type"]
-                up_type = self.gguf_loader.tensor_info[key + ".ffn_up.0.weight"]["ggml_type"]
-                down_type = self.gguf_loader.tensor_info[key + ".ffn_down.0.weight"]["ggml_type"]
+                gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate.0.weight")
+                up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up.0.weight")
+                down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down.0.weight")
             else:
                 raise ValueError(f"Experts {key} not found in gguf_loader")
             res = {key:{"gate": gate, "up": up, "down": down, "gate_type": gate_type, "up_type": up_type, "down_type": down_type}}
@@ -445,7 +461,7 @@ class KExpertsMarlin(KExpertsBase):
         down = None
         for key in keys:
-            if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
+            if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
                 gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                 up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                 down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
......
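
Across these operator changes the code now talks to a single loader surface instead of reaching into GGUF-specific fields. Below is a rough reconstruction of that surface, inferred only from the call sites in this commit; the actual ModelLoader base class in ktransformers.util.custom_loader may declare more, or slightly different, methods:

from abc import ABC, abstractmethod
import numpy as np
import torch

class ModelLoaderSketch(ABC):
    """Hypothetical reconstruction of the loader interface used by the operators above."""

    @abstractmethod
    def has_tensor(self, name: str) -> bool:
        """Replaces direct `name in loader.tensor_info` / `tensor_file_map` checks."""

    @abstractmethod
    def load_tensor(self, name: str) -> torch.Tensor:
        """Returns a ready-to-use tensor (the SafeTensorLoader path in linear.py)."""

    @abstractmethod
    def get_mmap_tensor(self, name: str) -> np.ndarray:
        """Raw memory-mapped weight bytes handed to the CPU MoE kernel."""

    @abstractmethod
    def get_ggml_type(self, name: str) -> int:
        """Replaces lookups of tensor_info[name]["ggml_type"]."""
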
@@ -40,7 +40,7 @@ class flashInferAttn():
         self.kv_layout = kv_layout
         self.use_cuda_graph = use_cuda_graph
         if flashInferAttn.float_workspace_buffer is None:
-            flashInferAttn.float_workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.uint8, device=device)
+            flashInferAttn.float_workspace_buffer = torch.empty(max_batch_token * 1024 * 1024, dtype=torch.uint8, device=device)
         self.qo_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
         self.paged_kv_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
         self.paged_kv_indices_buf = torch.empty((max_pages,), dtype=torch.int32, device=device)
......
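
Sizing note: the FlashInfer float workspace buffer is now derived from max_batch_token rather than fixed at 1 GiB, so the allocation tracks the serving configuration (illustrative values below):

# workspace_bytes = max_batch_token * 1024 * 1024
# max_batch_token = 1024 -> 1 GiB (the previous hard-coded size)
# max_batch_token = 256  -> 256 MiB
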
@@ -6,7 +6,7 @@ import os
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.operators.linear import KTransformersLinear
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader, ModelLoader, SafeTensorLoader
 from transformers.configuration_utils import PretrainedConfig
 from abc import ABC, abstractmethod
@@ -55,24 +55,20 @@ class KMoEGateBase(ABC):
         down_type = None
         for key in keys:
-            key = ".".join(key.split(".")[:-1])
-            if self.gguf_loader.safetensor_loader is not None:
-                targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
-                weight = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_inp.weight")
-                e_score_correction_bias = self.gguf_loader.safetensor_loader.load_tensor(key + ".exp_probs_b.bias")
-                weight_type = weight.dtype
-                e_score_correction_bias_type = e_score_correction_bias.dtype
-                res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
-            elif key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info:
-                targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
+            # key = ".".join(key.split(".")[:-1])
+            if isinstance(self.gguf_loader, SafeTensorLoader):
+                res = self.gguf_loader.load_gate(key, device=device)
+            elif self.gguf_loader.has_tensor(key+".weight"):
+                # targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
+                targets = [".weight", ".e_score_correction_bias"]
                 tensors = self.load_multi(key, targets, device=device)
-                weight = tensors[".ffn_gate_inp.weight"]
-                e_score_correction_bias = tensors[".exp_probs_b.bias"]
-                weight_type = self.gguf_loader.tensor_info[key + ".ffn_gate_inp.weight"]["ggml_type"]
-                e_score_correction_bias_type = self.gguf_loader.tensor_info[key + ".exp_probs_b.bias"]["ggml_type"]
+                weight = tensors[".weight"]
+                e_score_correction_bias = tensors[".e_score_correction_bias"]
+                # weight_type = self.gguf_loader.tensor_info[key + ".weight"]["ggml_type"]
+                res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias}
             else:
                 raise ValueError(f"Experts {key} not found in gguf_loader")
-            res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
         return res
     def load_multi(self, key: str, keys: list[str], device: str = "cpu"):
@@ -106,8 +102,6 @@ class KMoEGate(BaseInjectedModule, KMoEGateBase):
         if w is None: w = self.load_weights(device=device)
         if isinstance(w, dict):
-            self.weight_type = w["weight_type"]
-            self.e_score_correction_bias_type = w["e_score_correction_bias_type"]
             self.orig_module.weight = nn.Parameter(w["weight"])
             self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
         else:
@@ -175,8 +169,6 @@ class KMoEGateQwen2Moe(BaseInjectedModule, KMoEGateBase):
         if w is None: w = self.load_weights(device=device)
         if isinstance(w, dict):
-            self.weight_type = w["weight_type"]
-            self.e_score_correction_bias_type = w["e_score_correction_bias_type"]
             self.orig_module.weight = nn.Parameter(w["weight"])
             self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
         else:
......
@@ -29,7 +29,7 @@ from ktransformers.models.modeling_deepseek_v3 import DeepseekV3RMSNorm
 from ktransformers.models.modeling_qwen2_moe import Qwen2MoeRMSNorm
 from ktransformers.models.modeling_qwen3_moe import Qwen3MoeRMSNorm
 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from flashinfer.norm import (
     fused_add_rmsnorm,
     rmsnorm,
......
@@ -16,7 +16,7 @@ import torch
 from torch import Tensor, nn
 import KTransformersOps
 import vLLMMarlin
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader
 from ktransformers.util.utils import InferenceState
 from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_utils import (
     MarlinWorkspace,
@@ -83,15 +83,15 @@ class KLinearBase(ABC):
         keys = [self.key]
         for key in keys:
-            if self.gguf_loader.safetensor_loader is not None:
+            if isinstance(self.gguf_loader, SafeTensorLoader):
                 # using safetensor_loader
-                tensor = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight')
-                if key+'.weight_scale_inv' in self.gguf_loader.safetensor_loader.tensor_file_map:
-                    weight_scale_inv = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight_scale_inv')
+                tensor = self.gguf_loader.load_tensor(key+'.weight')
+                if self.gguf_loader.has_tensor(key+'.weight_scale_inv'):
+                    weight_scale_inv = self.gguf_loader.load_tensor(key+'.weight_scale_inv')
                     return nn.Parameter(tensor), nn.Parameter(weight_scale_inv)
                 return nn.Parameter(tensor)
-            elif key + ".weight" in self.gguf_loader.tensor_file_map:
+            elif self.gguf_loader.has_tensor(key + ".weight"):
                 if key + ".bias" in self.gguf_loader.tensor_file_map:
                     tensors = self.load_multi(key, ["weight", "bias"], device=device)
                     tensor = tensors["weight"]
@@ -760,7 +760,7 @@ class KLinearCPUInfer(KLinearBase):
         self.output_gpu = torch.zeros((1, 1, self.out_features), device=self.out_device)
     def load_weights(self, w: dict | nn.Parameter | tuple | None = None, device: str = "cpu"):
-        if self.key + ".weight" in self.gguf_loader.tensor_info:
+        if self.gguf_loader.has_tensor(self.key + ".weight"):
             if self.key + ".bias" in self.gguf_loader.tensor_file_map:
                 self.weight = self.gguf_loader.get_mmap_tensor(self.key + ".weight")
                 self.weight_type = self.gguf_loader.tensor_info[self.key + ".weight"]["ggml_type"]
......
 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from transformers import PretrainedConfig
 import torch.nn as nn
 from ktransformers.models.modeling_deepseek_v3 import DeepseekV3MLP
......
@@ -58,7 +58,7 @@ from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
 from ktransformers.models.configuration_llama import LlamaConfig
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.util.utils import InferenceState, get_compute_capability
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
 from transformers.configuration_utils import PretrainedConfig
 from ktransformers.models.modeling_llama import (
     LlamaDecoderLayer,
......
@@ -12,7 +12,7 @@ from torch import nn
 from transformers import AutoConfig
 from transformers.configuration_utils import PretrainedConfig
 # from operators import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
+from ktransformers.util.custom_loader import GGUFLoader, ModelLoaderFactory
 from ktransformers.util.utils import set_module, load_weights
 import itertools
 import copy
@@ -54,7 +54,7 @@ def del_meta(module:nn.Module):
 def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, prefix: str="", default_device: str = "cuda:0"):
     module_name = prefix[:-1]
-    translated_name = translate_name_to_gguf(prefix)[:-1]
+    # translated_name = translate_name_to_gguf(prefix)[:-1]
     #print("gen_optimize_config", prefix, module_name, translated_name)
     recursive = True
     for rule in rule_list:
@@ -76,7 +76,7 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
         if "replace" in rule:
             replace_meta = rule["replace"]
             if module_name not in out_data:
-                out_data[module_name]={"key": translated_name,
+                out_data[module_name]={"key": module_name,
                                        "class": replace_meta["class"] if "class" in replace_meta else "default",
                                        # "device": replace_meta["device"] if "device" in replace_meta else default_device,
                                        "kwargs": copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict()}
@@ -91,7 +91,7 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
         if module_name not in out_data:
             out_data[module_name]= {
                 "class": "default",
-                "key": translated_name,
+                "key": module_name,
                 "kwargs": {"generate_device": default_device,
                            "prefill_device": default_device}
             }
@@ -123,12 +123,12 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo
     model_config = translate_model_config(model_config)
-    gguf_loader=GGUFLoader(gguf_path)
+    weights_loader = ModelLoaderFactory.create_loader(gguf_path)
     with torch.device("meta"):
-        inject(module, optimize_config, model_config, gguf_loader)
+        inject(module, optimize_config, model_config, weights_loader)
     # pre load lm_head because its big inter result
-    load_weights(module.lm_head, gguf_loader, "lm_head.")
-    load_weights(module, gguf_loader)
-    module.gguf_loader = gguf_loader
+    load_weights(module.lm_head, weights_loader, "lm_head.")
+    load_weights(module, weights_loader)
+    module.gguf_loader = weights_loader
     del_meta(module)
     torch.cuda.empty_cache()
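
optimize_and_load_gguf now obtains its loader from ModelLoaderFactory.create_loader, so the same injection path serves both GGUF and safetensors checkpoints. A minimal sketch of such a dispatch is shown below; it assumes the factory keys off the file extensions in the weights directory and that both loaders take the directory path, which may differ from the real implementation in ktransformers.util.custom_loader:

import os
from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader

def create_loader_sketch(weights_path: str):
    # Hypothetical dispatch: prefer safetensors shards when present,
    # otherwise fall back to GGUF files.
    files = os.listdir(weights_path)
    if any(f.endswith(".safetensors") for f in files):
        return SafeTensorLoader(weights_path)
    if any(f.endswith(".gguf") for f in files):
        return GGUFLoader(weights_path)
    raise ValueError(f"No supported weight files found in {weights_path}")
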
- match:
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "KLinearFP8"
prefill_op: "KLinearTorch"
- match:
name: "^model\\.layers\\..*\\.mlp$"
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
replace:
class: ktransformers.operators.experts.KDeepseekV3MoEV2 # mlp module with custom forward function
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
class: ktransformers.models.modeling_deepseek_v3.MoEGate
replace:
class: ktransformers.operators.gate.KMoEGate
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\..*\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExpertsV2 # custom MoE Kernel with expert paralleism
kwargs:
prefill_device: "cuda"
prefill_op: "KExpertsTorch"
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda"
backend: "llamafile"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\..*\\.self_attn$"
replace:
class: ktransformers.operators.balance_serve_attention.flashinfer_attn # optimized MLA implementation
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.models.KDeepseekV2Model"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
replace:
class: ktransformers.operators.layernorm.RMSNorm
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP
replace:
class: ktransformers.operators.mlp.kDeepseekV3MLP
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
name: "^lm_head$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "VLinearMarlin"
prefill_op: "KLinearTorch"
\ No newline at end of file
@@ -128,10 +128,7 @@ class ArgumentParser:
         else:
             args.model_dir = self.cfg.model_dir
             args.model_path = self.cfg.model_path
-        # set config from args
-        for key, value in vars(args).items():
-            if value is not None and hasattr(self.cfg, key):
-                setattr(self.cfg, key, value)
         # we add the name not match args individually
         self.cfg.model_device = args.device
         self.cfg.mount_web = args.web
@@ -140,10 +137,15 @@ class ArgumentParser:
         self.cfg.user_force_think = args.force_think
         model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
-        if args.architectures == "Qwen3MoeForCausalLM" or args.architectures == "Qwen2MoeForCausalLM" :
+        if model_config.architectures[0] == "Qwen3MoeForCausalLM" or model_config.architectures[0] == "Qwen2MoeForCausalLM" :
             args.gpu_memory_size = args.cache_lens*2*2*model_config.num_hidden_layers*model_config.num_key_value_heads*model_config.head_dim
+            args.architectures = model_config.architectures[0]
         else:
             args.gpu_memory_size = args.cache_lens*2*576*61
+        # set config from args
+        for key, value in vars(args).items():
+            if value is not None and hasattr(self.cfg, key):
+                setattr(self.cfg, key, value)
         self.cfg.gpu_memory_size = args.gpu_memory_size
         free_ports = get_free_ports(3, [args.port])
         args.sched_port = free_ports[0]
......
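
The net effect of the ArgumentParser change: the args-to-config copy runs only after args.architectures and args.gpu_memory_size have been derived from the model directory, which is what allows the explicit architectures argument to be dropped. A condensed sketch of the reordered flow (a hypothetical helper, not the actual method):

from transformers import AutoConfig

def resolve_arch_and_memory(args, cfg):
    # Read the architecture from the model directory instead of a CLI argument.
    model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
    if model_config.architectures[0] in ("Qwen3MoeForCausalLM", "Qwen2MoeForCausalLM"):
        # GQA KV cache: 2 tensors (K and V) * 2 bytes per element.
        args.gpu_memory_size = (args.cache_lens * 2 * 2 * model_config.num_hidden_layers
                                * model_config.num_key_value_heads * model_config.head_dim)
        args.architectures = model_config.architectures[0]
    else:
        # MLA-style cache (e.g. DeepSeek): 576 dims per token, 61 layers.
        args.gpu_memory_size = args.cache_lens * 2 * 576 * 61
    # Mirror args into the server config only after the derived fields are final.
    for key, value in vars(args).items():
        if value is not None and hasattr(cfg, key):
            setattr(cfg, key, value)
    cfg.gpu_memory_size = args.gpu_memory_size
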
@@ -197,7 +197,7 @@ class Engine:
         self.block_num = inference_context.k_cache[0].size(1)
         #@TODO add config
         if config.architectures[0] == "Qwen2MoeForCausalLM" or config.architectures[0] == "Qwen3MoeForCausalLM":
-            self.model.init_wrapper(self.args.use_cuda_graph, self.device, 1024 ,args.max_batch_size, self.block_num) # TODO: 1024 is a magic number(max_batch_tokens)
+            self.model.init_wrapper(self.args.use_cuda_graph, self.device, Config().chunk_size, args.max_batch_size, self.block_num)
         else:
             self.model.init_wrapper(self.args.use_cuda_graph, self.device, args.max_batch_size, self.block_num)
......