"src/libtorchaudio/sox/utils.cpp" did not exist on "4f9b55201dea6d0dd5165a4f684d1160ee9ca94a"
Unverified Commit 6c4ed591 authored by Atream's avatar Atream Committed by GitHub
Browse files

Merge pull request #886 from kvcache-ai/fix-singleton-zbx

fix-singleton
parents 7f57769c 6f43bbe5
......@@ -56,7 +56,7 @@ def local_chat(
model_path: str | None = None,
optimize_config_path: str = None,
gguf_path: str | None = None,
max_new_tokens: int = 300,
max_new_tokens: int = 1000,
cpu_infer: int = Config().cpu_infer,
use_cuda_graph: bool = True,
prompt_file : str | None = None,
......
......@@ -26,6 +26,7 @@ import json
class DynamicScaledDotProductAttention:
remaining_length: int
cpu_infer = None
def __init__(
self,
......@@ -180,7 +181,9 @@ class DynamicScaledDotProductAttention:
self.preselect_block_num = 0 # block_num before preselect
self.evict_tokens = 0
self.cpu_infer = CPUInfer(threads_num)
if DynamicScaledDotProductAttention.cpu_infer is None:
DynamicScaledDotProductAttention.cpu_infer = CPUInfer(threads_num)
self.cpu_infer = DynamicScaledDotProductAttention.cpu_infer
self.local_thread = CPUInferKVCache(
self.layer_num,
self.kv_head_num,
......
......@@ -120,7 +120,7 @@ class KExpertsCPU(KExpertsBase):
output_gpu_map:dict = {} # Manage output tensor buffer on different gpu
#stream_map:dict = {} # Manage cuda stream on different gpu
#gguf_loader:GGUFLoader = None
CPU_INFER = CPUInfer(Config().cpu_infer)
CPU_INFER = None
def __init__(
self,
key: str,
......@@ -133,6 +133,8 @@ class KExpertsCPU(KExpertsBase):
**kwargs
):
super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
if KExpertsCPU.CPU_INFER is None:
KExpertsCPU.CPU_INFER = CPUInfer(Config().cpu_infer)
#if KExpertsCPU.gguf_loader is None:
# KExpertsCPU.gguf_loader = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
self.gguf_loader = gguf_loader
......
......@@ -360,7 +360,7 @@ class KLinearMarlin(KLinearBase):
self.workspace = None
class KLinearCPUInfer(KLinearBase):
CPU_INFER = CPUInfer(Config().cpu_infer)
CPU_INFER = None
def __init__(
self,
key: str,
......@@ -374,6 +374,8 @@ class KLinearCPUInfer(KLinearBase):
**kwargs,
):
super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
if KLinearCPUInfer.CPU_INFER is None:
KLinearCPUInfer.CPU_INFER = CPUInfer(Config().cpu_infer)
self.has_bias = False
self.dtype = torch.get_default_dtype()
self.w = None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment