Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ktransformers
Commits
6c4ed591
Unverified
Commit
6c4ed591
authored
Mar 14, 2025
by
Atream
Committed by
GitHub
Mar 14, 2025
Browse files
Merge pull request #886 from kvcache-ai/fix-singleton-zbx
fix-singleton
parents
7f57769c
6f43bbe5
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
11 additions
and
4 deletions
+11
-4
ktransformers/local_chat.py
ktransformers/local_chat.py
+1
-1
ktransformers/operators/dynamic_attention.py
ktransformers/operators/dynamic_attention.py
+4
-1
ktransformers/operators/experts.py
ktransformers/operators/experts.py
+3
-1
ktransformers/operators/linear.py
ktransformers/operators/linear.py
+3
-1
No files found.
ktransformers/local_chat.py
View file @
6c4ed591
...
@@ -56,7 +56,7 @@ def local_chat(
...
@@ -56,7 +56,7 @@ def local_chat(
model_path
:
str
|
None
=
None
,
model_path
:
str
|
None
=
None
,
optimize_config_path
:
str
=
None
,
optimize_config_path
:
str
=
None
,
gguf_path
:
str
|
None
=
None
,
gguf_path
:
str
|
None
=
None
,
max_new_tokens
:
int
=
3
00
,
max_new_tokens
:
int
=
10
00
,
cpu_infer
:
int
=
Config
().
cpu_infer
,
cpu_infer
:
int
=
Config
().
cpu_infer
,
use_cuda_graph
:
bool
=
True
,
use_cuda_graph
:
bool
=
True
,
prompt_file
:
str
|
None
=
None
,
prompt_file
:
str
|
None
=
None
,
...
...
ktransformers/operators/dynamic_attention.py
View file @
6c4ed591
...
@@ -26,6 +26,7 @@ import json
...
@@ -26,6 +26,7 @@ import json
class
DynamicScaledDotProductAttention
:
class
DynamicScaledDotProductAttention
:
remaining_length
:
int
remaining_length
:
int
cpu_infer
=
None
def
__init__
(
def
__init__
(
self
,
self
,
...
@@ -180,7 +181,9 @@ class DynamicScaledDotProductAttention:
...
@@ -180,7 +181,9 @@ class DynamicScaledDotProductAttention:
self
.
preselect_block_num
=
0
# block_num before preselect
self
.
preselect_block_num
=
0
# block_num before preselect
self
.
evict_tokens
=
0
self
.
evict_tokens
=
0
self
.
cpu_infer
=
CPUInfer
(
threads_num
)
if
DynamicScaledDotProductAttention
.
cpu_infer
is
None
:
DynamicScaledDotProductAttention
.
cpu_infer
=
CPUInfer
(
threads_num
)
self
.
cpu_infer
=
DynamicScaledDotProductAttention
.
cpu_infer
self
.
local_thread
=
CPUInferKVCache
(
self
.
local_thread
=
CPUInferKVCache
(
self
.
layer_num
,
self
.
layer_num
,
self
.
kv_head_num
,
self
.
kv_head_num
,
...
...
ktransformers/operators/experts.py
View file @
6c4ed591
...
@@ -120,7 +120,7 @@ class KExpertsCPU(KExpertsBase):
...
@@ -120,7 +120,7 @@ class KExpertsCPU(KExpertsBase):
output_gpu_map
:
dict
=
{}
# Manage output tensor buffer on different gpu
output_gpu_map
:
dict
=
{}
# Manage output tensor buffer on different gpu
#stream_map:dict = {} # Manage cuda stream on different gpu
#stream_map:dict = {} # Manage cuda stream on different gpu
#gguf_loader:GGUFLoader = None
#gguf_loader:GGUFLoader = None
CPU_INFER
=
CPUInfer
(
Config
().
cpu_infer
)
CPU_INFER
=
None
def
__init__
(
def
__init__
(
self
,
self
,
key
:
str
,
key
:
str
,
...
@@ -133,6 +133,8 @@ class KExpertsCPU(KExpertsBase):
...
@@ -133,6 +133,8 @@ class KExpertsCPU(KExpertsBase):
**
kwargs
**
kwargs
):
):
super
().
__init__
(
key
,
gguf_loader
,
config
,
orig_module
,
device
,
**
kwargs
)
super
().
__init__
(
key
,
gguf_loader
,
config
,
orig_module
,
device
,
**
kwargs
)
if
KExpertsCPU
.
CPU_INFER
is
None
:
KExpertsCPU
.
CPU_INFER
=
CPUInfer
(
Config
().
cpu_infer
)
#if KExpertsCPU.gguf_loader is None:
#if KExpertsCPU.gguf_loader is None:
# KExpertsCPU.gguf_loader = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
# KExpertsCPU.gguf_loader = GGUFLoader("/mnt/data/model/DeepseekV3-q4km-gguf")
self
.
gguf_loader
=
gguf_loader
self
.
gguf_loader
=
gguf_loader
...
...
ktransformers/operators/linear.py
View file @
6c4ed591
...
@@ -360,7 +360,7 @@ class KLinearMarlin(KLinearBase):
...
@@ -360,7 +360,7 @@ class KLinearMarlin(KLinearBase):
self
.
workspace
=
None
self
.
workspace
=
None
class
KLinearCPUInfer
(
KLinearBase
):
class
KLinearCPUInfer
(
KLinearBase
):
CPU_INFER
=
CPUInfer
(
Config
().
cpu_infer
)
CPU_INFER
=
None
def
__init__
(
def
__init__
(
self
,
self
,
key
:
str
,
key
:
str
,
...
@@ -374,6 +374,8 @@ class KLinearCPUInfer(KLinearBase):
...
@@ -374,6 +374,8 @@ class KLinearCPUInfer(KLinearBase):
**
kwargs
,
**
kwargs
,
):
):
super
().
__init__
(
key
,
gguf_loader
,
config
,
orig_module
,
device
,
**
kwargs
)
super
().
__init__
(
key
,
gguf_loader
,
config
,
orig_module
,
device
,
**
kwargs
)
if
KLinearCPUInfer
.
CPU_INFER
is
None
:
KLinearCPUInfer
.
CPU_INFER
=
CPUInfer
(
Config
().
cpu_infer
)
self
.
has_bias
=
False
self
.
has_bias
=
False
self
.
dtype
=
torch
.
get_default_dtype
()
self
.
dtype
=
torch
.
get_default_dtype
()
self
.
w
=
None
self
.
w
=
None
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment