Commit c6aa379d authored by qiyuxinlin's avatar qiyuxinlin

support safetensor load, delete architectures argument

parent 900a7f7c
import os
import sys
sys.path.insert(0,"/home/zbx/ktransformers")
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
import torch
gguf_loader_1 = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
......
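The test script above now imports GGUFLoader from ktransformers.util.custom_loader instead of ktransformers.util.custom_gguf. For scripts that have to run against both pre- and post-refactor trees, a small compatibility shim is one option (a hedged sketch; the fallback branch is only an assumption about older checkouts):

```python
# Prefer the new custom_loader module introduced by this commit and fall back
# to the old custom_gguf location on older ktransformers checkouts.
try:
    from ktransformers.util.custom_loader import GGUFLoader  # new location
except ImportError:
    from ktransformers.util.custom_gguf import GGUFLoader    # old location

loader = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")  # path taken from the test above
```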
#!/bin/bash
set -e
# clear build dirs
# rm -rf build
# rm -rf *.egg-info
# rm -rf csrc/build
# rm -rf csrc/ktransformers_ext/build
# rm -rf csrc/ktransformers_ext/cuda/build
# rm -rf csrc/ktransformers_ext/cuda/dist
# rm -rf csrc/ktransformers_ext/cuda/*.egg-info
rm -rf ~/.ktransformers
echo "Installing python dependencies from requirements.txt"
pip install -r requirements-local_chat.txt
pip install -r ktransformers/server/requirements.txt
echo "Installing ktransformers"
KTRANSFORMERS_FORCE_BUILD=TRUE USE_BALANCE_SERVE=1 pip install -v . --no-build-isolation
pip install third_party/custom_flashinfer/ -v
# SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
# echo "Copying thirdparty libs to $SITE_PACKAGES"
# cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/
# patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython*
echo "Installation completed successfully"
@@ -66,7 +66,7 @@ class StaticCache(transformers.StaticCache):
        self.page_table_list = []
        for idx in range(config.num_hidden_layers):
            if isinstance(device, dict):
-                target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
+                target_device = device[f"model.layers.{idx}.self_attn"]["generate_device"]
            else:
                target_device = device
@@ -91,7 +91,7 @@ class StaticCache(transformers.StaticCache):
            # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
            # breaks when updating the cache.
            if isinstance(device, dict):
-                target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
+                target_device = device[f"model.layers.{idx}.self_attn"]["generate_device"]
            else:
                target_device = device
......
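The cache now looks up per-layer devices with the original Hugging Face module names (model.layers.N.self_attn) instead of the GGUF-style blk.N names, matching the key change in gen_optimize_config further down. A minimal sketch of the expected mapping shape (the layer count and device strings are illustrative only):

```python
# Illustrative per-layer device map keyed by HF module names; the values follow the
# {"generate_device": ...} shape that StaticCache.__init__ reads above.
num_hidden_layers = 4  # placeholder; real models have far more layers
device_map = {
    f"model.layers.{idx}.self_attn": {"generate_device": "cuda:0", "prefill_device": "cuda:0"}
    for idx in range(num_hidden_layers)
}

def pick_cache_device(device, idx):
    # Mirrors the branch in StaticCache.__init__: dict -> per-layer lookup, str -> shared device.
    if isinstance(device, dict):
        return device[f"model.layers.{idx}.self_attn"]["generate_device"]
    return device

assert pick_cache_device(device_map, 2) == "cuda:0"
assert pick_cache_device("cuda:1", 2) == "cuda:1"
```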
@@ -39,7 +39,7 @@ class KQwen2MoeForCausalLM(Qwen2MoePreTrainedModel):
        self.cache = cache
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        self.attn = [None] * 10
+        self.attn = [None] * 100
    def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
        self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)
......
@@ -39,7 +39,7 @@ class KQwen3MoeForCausalLM(Qwen3MoePreTrainedModel):
        self.cache = cache
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        self.attn = [None] * 10
+        self.attn = [None] * 100
    def init_wrapper(self, use_cuda_graph, device, max_batch_token, max_batch_size, max_pages, cuda_graph_idx = 0):
        self.attn[cuda_graph_idx] = flashInferAttn(use_cuda_graph=use_cuda_graph, max_batch_token=max_batch_token, max_batch_size=max_batch_size, max_pages=max_pages, device=device)
......
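The wrapper list is indexed by cuda_graph_idx, so it needs at least one slot per captured CUDA graph; the graph-size list built by generate_cuda_graphs in experts.py below can exceed ten entries for large chunk sizes, which is presumably why the headroom was raised from 10 to 100. A standalone sketch of the count (max_batch_size stands in for Config().max_batch_size and is only an example value):

```python
def count_cuda_graphs(chunk_size: int, max_batch_size: int = 41) -> int:
    # Re-statement of the new generate_cuda_graphs() for sizing purposes only.
    base = [1, 2, 3, max_batch_size, 64, 256, 512, chunk_size]
    if chunk_size <= 1024:
        return len(base)
    multiples = list(range(1024, chunk_size + 1, 1024))
    return len(sorted(set(base + multiples)))

# chunk_size=8192 -> 8 base entries plus the multiples of 1024 up to 8192 (8192 counted once)
print(count_cuda_graphs(8192))  # 15 with these example values, already beyond the old 10 slots
```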
@@ -23,7 +23,7 @@ from ktransformers.models.modeling_deepseek import (
    yarn_find_correction_range
)
from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import InferenceState
from transformers.configuration_utils import PretrainedConfig
import torch
......
@@ -15,7 +15,7 @@ from ktransformers.models.modeling_llama import LlamaRotaryEmbedding
from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
from ktransformers.util.utils import get_compute_capability
import logging
from transformers.configuration_utils import PretrainedConfig
......
@@ -11,7 +11,7 @@ from ktransformers.models.modeling_qwen2_moe import Qwen2MoeAttention
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeAttention
from typing import Optional, Tuple
from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
import logging
from transformers.configuration_utils import PretrainedConfig
from flashinfer import BatchMLAPagedAttentionWrapper
......
@@ -6,7 +6,7 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
from typing import Any
from torch import nn, Tensor
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
import ktransformers.util.utils as utils
class BaseInjectedModule(nn.Module):
......
@@ -26,7 +26,8 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext
import cpuinfer_ext
from cpuinfer_ext.moe import MOEConfig, MOE
import ctypes
-from ktransformers.util.custom_gguf import GGMLQuantizationType, GGUFLoader
+from ktransformers.util.custom_gguf import GGMLQuantizationType
+from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader, ModelLoader
from ktransformers.util.utils import InferenceState
from ktransformers.server.config.config import Config
from transformers.activations import ACT2FN
@@ -39,8 +40,18 @@ from ktransformers.operators.cpuinfer import CPUInfer
def deduplicate_and_sort(lst):
    return sorted(set(lst))
+def generate_cuda_graphs(chunk_size: int) -> list:
+    assert chunk_size <= 1024 or chunk_size % 1024 == 0, "chunk_size must <= 1024 or a multiple of 1024"
+    base_list = [1, 2, 3, Config().max_batch_size, 64, 256, 512, chunk_size]
+    if chunk_size <= 1024:
+        return base_list
+    multiples = [i for i in range(1024, chunk_size + 1, 1024)]
+    return deduplicate_and_sort(base_list + multiples)
#cuda_graphs = [Config().chunk_size]
-cuda_graphs = deduplicate_and_sort([1, 2, 3, Config().max_batch_size, 64, Config().chunk_size])
+cuda_graphs = generate_cuda_graphs(Config().chunk_size)
# class Base(BaseInjectedModule, ABC):
class KExpertsBase(ABC):
    def __init__(self, key: str, gguf_loader: GGUFLoader, config: PretrainedConfig, orig_module: nn.Module, device: str = "cuda", **kwargs):
@@ -77,7 +88,7 @@ class KExpertsBase(ABC):
        down_type = None
        for key in keys:
-            if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
+            if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
                targets = [".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight" ]
                tensors = self.load_multi(key, targets, device=device)
                gate = tensors[".ffn_gate_exps.weight"]
@@ -86,7 +97,7 @@ class KExpertsBase(ABC):
                gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
                up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
                down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
-            elif key + ".ffn_down.0.weight" in self.gguf_loader.tensor_info:
+            elif self.gguf_loader.has_tensor(key + ".ffn_down.0.weight"):
                # for supporting Mixtral-8x7B-Instuct
                gate = []
                up = []
@@ -194,7 +205,7 @@ class KExpertsCPU(KExpertsBase):
            self.config.num_experts_per_tok,
            self.config.hidden_size,
            self.config.moe_intermediate_size,
-            25600,
+            max(cuda_graphs),
            gate_ptr,
            up_ptr,
            down_ptr,
@@ -212,7 +223,7 @@ class KExpertsCPU(KExpertsBase):
            self.config.num_experts_per_tok,
            self.config.hidden_size,
            self.config.moe_intermediate_size,
-            25600,
+            max(cuda_graphs),
            gate_ptr,
            up_ptr,
            down_ptr,
@@ -325,14 +336,19 @@ class KExpertsCPU(KExpertsBase):
        down_type = None
        for key in keys:
-            if self.gguf_loader.safetensor_loader is not None:
-                # using a temp ugly way to temprary load the tensor
-                gate = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.weight").numpy()
-                up = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.weight").numpy()
-                down = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.weight").numpy()
-                gate_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_exps.ggml_type").item()
-                up_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_up_exps.ggml_type").item()
-                down_type = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_down_exps.ggml_type").item()
+            if isinstance(self.gguf_loader, SafeTensorLoader):
+                res = self.gguf_loader.load_experts(key)
+                return {key: res}
+            elif self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
+                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
+                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
+                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
+                # gate_type = self.gguf_loader.tensor_info[key + ".ffn_gate_exps.weight"]["ggml_type"]
+                # up_type = self.gguf_loader.tensor_info[key + ".ffn_up_exps.weight"]["ggml_type"]
+                # down_type = self.gguf_loader.tensor_info[key + ".ffn_down_exps.weight"]["ggml_type"]
+                gate_type = self.gguf_loader.get_ggml_type(key + ".ffn_gate_exps.weight")
+                up_type = self.gguf_loader.get_ggml_type(key + ".ffn_up_exps.weight")
+                down_type = self.gguf_loader.get_ggml_type(key + ".ffn_down_exps.weight")
            elif key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
@@ -445,7 +461,7 @@ class KExpertsMarlin(KExpertsBase):
        down = None
        for key in keys:
-            if key + ".ffn_gate_exps.weight" in self.gguf_loader.tensor_info:
+            if self.gguf_loader.has_tensor(key + ".ffn_gate_exps.weight"):
                gate = self.gguf_loader.get_mmap_tensor(key + ".ffn_gate_exps.weight")
                up = self.gguf_loader.get_mmap_tensor(key + ".ffn_up_exps.weight")
                down = self.gguf_loader.get_mmap_tensor(key + ".ffn_down_exps.weight")
......
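KExpertsCPU.load_weights now dispatches on the loader type: a SafeTensorLoader answers with whatever its load_experts(key) returns, while the GGUF path still assembles the gate/up/down arrays and their ggml types via get_mmap_tensor and get_ggml_type. A self-contained sketch of that dispatch with stand-in loaders (the method names mirror the diff; the stub classes and payloads are made up for illustration):

```python
class _StubSafeTensorLoader:
    def load_experts(self, key):
        # Stand-in for SafeTensorLoader.load_experts(); the real payload holds the expert tensors.
        return {"gate": "...", "up": "...", "down": "..."}

class _StubGGUFLoader:
    def has_tensor(self, name):
        return name.endswith((".ffn_gate_exps.weight", ".ffn_up_exps.weight", ".ffn_down_exps.weight"))
    def get_mmap_tensor(self, name):
        return f"mmap:{name}"
    def get_ggml_type(self, name):
        return 12  # e.g. a GGMLQuantizationType value

def load_expert_weights(loader, key):
    # Simplified mirror of the new branch order in KExpertsCPU.load_weights.
    if isinstance(loader, _StubSafeTensorLoader):
        return {key: loader.load_experts(key)}
    elif loader.has_tensor(key + ".ffn_gate_exps.weight"):
        return {key: {
            "gate": loader.get_mmap_tensor(key + ".ffn_gate_exps.weight"),
            "up": loader.get_mmap_tensor(key + ".ffn_up_exps.weight"),
            "down": loader.get_mmap_tensor(key + ".ffn_down_exps.weight"),
            "gate_type": loader.get_ggml_type(key + ".ffn_gate_exps.weight"),
            "up_type": loader.get_ggml_type(key + ".ffn_up_exps.weight"),
            "down_type": loader.get_ggml_type(key + ".ffn_down_exps.weight"),
        }}
    raise ValueError(f"Experts {key} not found")

print(load_expert_weights(_StubGGUFLoader(), "blk.3"))
```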
@@ -40,7 +40,7 @@ class flashInferAttn():
        self.kv_layout = kv_layout
        self.use_cuda_graph = use_cuda_graph
        if flashInferAttn.float_workspace_buffer is None:
-            flashInferAttn.float_workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.uint8, device=device)
+            flashInferAttn.float_workspace_buffer = torch.empty(max_batch_token * 1024 * 1024, dtype=torch.uint8, device=device)
        self.qo_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
        self.paged_kv_indptr_buf = torch.empty((max_batch_size+1,), dtype=torch.int32, device=device)
        self.paged_kv_indices_buf = torch.empty((max_pages,), dtype=torch.int32, device=device)
......
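The shared float workspace buffer now scales with max_batch_token (max_batch_token MiB of uint8) instead of a fixed 1 GiB, so small serving configurations stop reserving a full gigabyte up front. A quick check of the arithmetic (the sample token counts are illustrative):

```python
def workspace_bytes(max_batch_token: int) -> int:
    # Matches the new allocation: max_batch_token * 1024 * 1024 bytes of uint8.
    return max_batch_token * 1024 * 1024

print(workspace_bytes(256) / 2**20)   # 256.0 MiB for a small chunk size
print(workspace_bytes(1024) / 2**30)  # 1.0 GiB, i.e. the old fixed allocation
```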
@@ -6,7 +6,7 @@ import os
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.operators.linear import KTransformersLinear
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader, ModelLoader, SafeTensorLoader
from transformers.configuration_utils import PretrainedConfig
from abc import ABC, abstractmethod
@@ -55,24 +55,20 @@ class KMoEGateBase(ABC):
        down_type = None
        for key in keys:
-            key = ".".join(key.split(".")[:-1])
-            if self.gguf_loader.safetensor_loader is not None:
-                targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
-                weight = self.gguf_loader.safetensor_loader.load_tensor(key + ".ffn_gate_inp.weight")
-                e_score_correction_bias = self.gguf_loader.safetensor_loader.load_tensor(key + ".exp_probs_b.bias")
-                weight_type = weight.dtype
-                e_score_correction_bias_type = e_score_correction_bias.dtype
-                res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
-            elif key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info:
-                targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
+            # key = ".".join(key.split(".")[:-1])
+            if isinstance(self.gguf_loader, SafeTensorLoader):
+                res = self.gguf_loader.load_gate(key, device=device)
+            elif self.gguf_loader.has_tensor(key+".weight"):
+                # targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
+                targets = [".weight", ".e_score_correction_bias"]
                tensors = self.load_multi(key, targets, device=device)
-                weight = tensors[".ffn_gate_inp.weight"]
-                e_score_correction_bias = tensors[".exp_probs_b.bias"]
-                weight_type = self.gguf_loader.tensor_info[key + ".ffn_gate_inp.weight"]["ggml_type"]
-                e_score_correction_bias_type = self.gguf_loader.tensor_info[key + ".exp_probs_b.bias"]["ggml_type"]
+                weight = tensors[".weight"]
+                e_score_correction_bias = tensors[".e_score_correction_bias"]
+                # weight_type = self.gguf_loader.tensor_info[key + ".weight"]["ggml_type"]
+                res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias}
            else:
                raise ValueError(f"Experts {key} not found in gguf_loader")
-            res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
        return res
    def load_multi(self, key: str, keys: list[str], device: str = "cpu"):
@@ -106,8 +102,6 @@ class KMoEGate(BaseInjectedModule, KMoEGateBase):
        if w is None: w = self.load_weights(device=device)
        if isinstance(w, dict):
-            self.weight_type = w["weight_type"]
-            self.e_score_correction_bias_type = w["e_score_correction_bias_type"]
            self.orig_module.weight = nn.Parameter(w["weight"])
            self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
        else:
@@ -175,8 +169,6 @@ class KMoEGateQwen2Moe(BaseInjectedModule, KMoEGateBase):
        if w is None: w = self.load_weights(device=device)
        if isinstance(w, dict):
-            self.weight_type = w["weight_type"]
-            self.e_score_correction_bias_type = w["e_score_correction_bias_type"]
            self.orig_module.weight = nn.Parameter(w["weight"])
            self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
        else:
......
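With the switch to module-name keys, the gate loader now asks for key + ".weight" and key + ".e_score_correction_bias" directly and no longer records separate weight_type fields. A small sketch of how the old GGUF-style names line up with the module-style names for a DeepSeek-V3 gate (layer 3 is just an example index):

```python
# GGUF-style tensor names previously requested by KMoEGateBase.load() versus the
# module-style names used now; the concrete layer index is illustrative.
old_to_new = {
    "blk.3.ffn_gate_inp.weight": "model.layers.3.mlp.gate.weight",
    "blk.3.exp_probs_b.bias": "model.layers.3.mlp.gate.e_score_correction_bias",
}
for old, new in old_to_new.items():
    print(f"{old:30s} -> {new}")
```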
@@ -29,7 +29,7 @@ from ktransformers.models.modeling_deepseek_v3 import DeepseekV3RMSNorm
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeRMSNorm
from ktransformers.models.modeling_qwen3_moe import Qwen3MoeRMSNorm
from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
from flashinfer.norm import (
    fused_add_rmsnorm,
    rmsnorm,
......
@@ -16,7 +16,7 @@ import torch
from torch import Tensor, nn
import KTransformersOps
import vLLMMarlin
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader
from ktransformers.util.utils import InferenceState
from ktransformers.ktransformers_ext.operators.custom_marlin.quantize.utils.marlin_utils import (
    MarlinWorkspace,
@@ -83,15 +83,15 @@ class KLinearBase(ABC):
            keys = [self.key]
        for key in keys:
-            if self.gguf_loader.safetensor_loader is not None:
+            if isinstance(self.gguf_loader, SafeTensorLoader):
                # using safetensor_loader
-                tensor = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight')
-                if key+'.weight_scale_inv' in self.gguf_loader.safetensor_loader.tensor_file_map:
-                    weight_scale_inv = self.gguf_loader.safetensor_loader.load_tensor(key+'.weight_scale_inv')
+                tensor = self.gguf_loader.load_tensor(key+'.weight')
+                if self.gguf_loader.has_tensor(key+'.weight_scale_inv'):
+                    weight_scale_inv = self.gguf_loader.load_tensor(key+'.weight_scale_inv')
                    return nn.Parameter(tensor), nn.Parameter(weight_scale_inv)
                return nn.Parameter(tensor)
-            elif key + ".weight" in self.gguf_loader.tensor_file_map:
+            elif self.gguf_loader.has_tensor(key + ".weight"):
                if key + ".bias" in self.gguf_loader.tensor_file_map:
                    tensors = self.load_multi(key, ["weight", "bias"], device=device)
                    tensor = tensors["weight"]
@@ -760,7 +760,7 @@ class KLinearCPUInfer(KLinearBase):
            self.output_gpu = torch.zeros((1, 1, self.out_features), device=self.out_device)
    def load_weights(self, w: dict | nn.Parameter | tuple | None = None, device: str = "cpu"):
-        if self.key + ".weight" in self.gguf_loader.tensor_info:
+        if self.gguf_loader.has_tensor(self.key + ".weight"):
            if self.key + ".bias" in self.gguf_loader.tensor_file_map:
                self.weight = self.gguf_loader.get_mmap_tensor(self.key + ".weight")
                self.weight_type = self.gguf_loader.tensor_info[self.key + ".weight"]["ggml_type"]
......
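On the safetensors path, KLinearBase.load_weight can now return a (weight, weight_scale_inv) pair when a block-scale tensor is present (as with FP8 DeepSeek-V3 checkpoints) and a bare weight otherwise, so callers have to accept both shapes. A minimal sketch of handling that return value (the helper name is hypothetical; forwarding to an FP8-aware op such as KLinearFP8 is only indicative):

```python
def unpack_linear_weight(loaded):
    # load_weight returns either nn.Parameter(weight) or
    # (nn.Parameter(weight), nn.Parameter(weight_scale_inv)).
    if isinstance(loaded, tuple):
        weight, weight_scale_inv = loaded
        # The scale tensor would be handed to an FP8-aware kernel (e.g. KLinearFP8).
        return weight, weight_scale_inv
    return loaded, None
```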
from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
from transformers import PretrainedConfig
import torch.nn as nn
from ktransformers.models.modeling_deepseek_v3 import DeepseekV3MLP
......
@@ -58,7 +58,7 @@ from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
from ktransformers.models.configuration_llama import LlamaConfig
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.utils import InferenceState, get_compute_capability
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_loader import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
from ktransformers.models.modeling_llama import (
    LlamaDecoderLayer,
......
@@ -12,7 +12,7 @@ from torch import nn
from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig
# from operators import BaseInjectedModule
-from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
+from ktransformers.util.custom_loader import GGUFLoader, ModelLoaderFactory
from ktransformers.util.utils import set_module, load_weights
import itertools
import copy
@@ -54,7 +54,7 @@ def del_meta(module:nn.Module):
def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, prefix: str="", default_device: str = "cuda:0"):
    module_name = prefix[:-1]
-    translated_name = translate_name_to_gguf(prefix)[:-1]
+    # translated_name = translate_name_to_gguf(prefix)[:-1]
    #print("gen_optimize_config", prefix, module_name, translated_name)
    recursive = True
    for rule in rule_list:
@@ -76,7 +76,7 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
        if "replace" in rule:
            replace_meta = rule["replace"]
            if module_name not in out_data:
-                out_data[module_name]={"key": translated_name,
+                out_data[module_name]={"key": module_name,
                                       "class": replace_meta["class"] if "class" in replace_meta else "default",
                                       # "device": replace_meta["device"] if "device" in replace_meta else default_device,
                                       "kwargs": copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict()}
@@ -91,7 +91,7 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
    if module_name not in out_data:
        out_data[module_name]= {
            "class": "default",
-            "key": translated_name,
+            "key": module_name,
            "kwargs": {"generate_device": default_device,
                       "prefill_device": default_device}
        }
@@ -123,12 +123,12 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo
    model_config = translate_model_config(model_config)
-    gguf_loader=GGUFLoader(gguf_path)
+    weights_loader = ModelLoaderFactory.create_loader(gguf_path)
    with torch.device("meta"):
-        inject(module, optimize_config, model_config, gguf_loader)
+        inject(module, optimize_config, model_config, weights_loader)
    # pre load lm_head because its big inter result
-    load_weights(module.lm_head, gguf_loader, "lm_head.")
-    load_weights(module, gguf_loader)
-    module.gguf_loader = gguf_loader
+    load_weights(module.lm_head, weights_loader, "lm_head.")
+    load_weights(module, weights_loader)
+    module.gguf_loader = weights_loader
    del_meta(module)
    torch.cuda.empty_cache()
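optimize_and_load_gguf now obtains its loader from ModelLoaderFactory.create_loader(gguf_path), so the same injection path can serve either a GGUF directory or a safetensors checkpoint, and rule keys stay as plain module names (translate_name_to_gguf is gone). A hedged sketch of what such a factory selection could look like; the real create_loader may use entirely different criteria, and the SafeTensorLoader constructor signature is assumed here:

```python
import os

def create_loader_sketch(path: str):
    # Assumption: choose the loader by the file types found under `path`.
    from ktransformers.util.custom_loader import GGUFLoader, SafeTensorLoader
    if any(f.endswith(".safetensors") for f in os.listdir(path)):
        return SafeTensorLoader(path)  # constructor signature assumed
    return GGUFLoader(path)
```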
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearFP8"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoEV2  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExpertsV2  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "llamafile"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.balance_serve_attention.flashinfer_attn  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RMSNorm
  replace:
    class: ktransformers.operators.layernorm.RMSNorm
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MLP
  replace:
    class: ktransformers.operators.mlp.kDeepseekV3MLP
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "VLinearMarlin"
      prefill_op: "KLinearTorch"
\ No newline at end of file
@@ -128,10 +128,7 @@ class ArgumentParser:
        else:
            args.model_dir = self.cfg.model_dir
            args.model_path = self.cfg.model_path
-        # set config from args
-        for key, value in vars(args).items():
-            if value is not None and hasattr(self.cfg, key):
-                setattr(self.cfg, key, value)
        # we add the name not match args individually
        self.cfg.model_device = args.device
        self.cfg.mount_web = args.web
@@ -140,10 +137,15 @@ class ArgumentParser:
        self.cfg.user_force_think = args.force_think
        model_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
-        if args.architectures == "Qwen3MoeForCausalLM" or args.architectures == "Qwen2MoeForCausalLM" :
+        if model_config.architectures[0] == "Qwen3MoeForCausalLM" or model_config.architectures[0] == "Qwen2MoeForCausalLM" :
            args.gpu_memory_size = args.cache_lens*2*2*model_config.num_hidden_layers*model_config.num_key_value_heads*model_config.head_dim
+            args.architectures = model_config.architectures[0]
        else:
            args.gpu_memory_size = args.cache_lens*2*576*61
+        # set config from args
+        for key, value in vars(args).items():
+            if value is not None and hasattr(self.cfg, key):
+                setattr(self.cfg, key, value)
        self.cfg.gpu_memory_size = args.gpu_memory_size
        free_ports = get_free_ports(3, [args.port])
        args.sched_port = free_ports[0]
......
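The KV-cache budget is now derived from the model config instead of a CLI architectures flag: for the Qwen MoE models it is cache_lens * 2 * 2 * layers * kv_heads * head_dim (the factors of 2 presumably covering K plus V and fp16 bytes), otherwise the DeepSeek-V3 MLA shape of 576 cached values over 61 layers is assumed. A worked example with illustrative numbers (the config values are placeholders, not read from a real checkpoint):

```python
cache_lens = 16384            # tokens of KV cache (example)
num_hidden_layers = 48        # placeholder Qwen3-MoE-like config
num_key_value_heads = 4
head_dim = 128

# Qwen2/Qwen3 MoE branch: 2 (K and V) * 2 bytes (fp16) per cached element.
qwen_bytes = cache_lens * 2 * 2 * num_hidden_layers * num_key_value_heads * head_dim
print(qwen_bytes / 2**30)     # 1.5 GiB with these numbers

# Fallback branch (DeepSeek-V3 MLA): 576 cached values per token over 61 layers, 2 bytes each.
deepseek_bytes = cache_lens * 2 * 576 * 61
print(deepseek_bytes / 2**30) # ~1.07 GiB
```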
@@ -197,7 +197,7 @@ class Engine:
        self.block_num = inference_context.k_cache[0].size(1)
        #@TODO add config
        if config.architectures[0] == "Qwen2MoeForCausalLM" or config.architectures[0] == "Qwen3MoeForCausalLM":
-            self.model.init_wrapper(self.args.use_cuda_graph, self.device, 1024 ,args.max_batch_size, self.block_num) # TODO: 1024 is a magic number(max_batch_tokens)
+            self.model.init_wrapper(self.args.use_cuda_graph, self.device, Config().chunk_size, args.max_batch_size, self.block_num) # TODO: 1024 is a magic number(max_batch_tokens)
        else:
            self.model.init_wrapper(self.args.use_cuda_graph, self.device, args.max_batch_size, self.block_num)
......