OpenDAS / ktransformers · Commits

Commit 72d09f3f (unverified)
Authored Feb 22, 2025 by Atream; committed by GitHub on Feb 22, 2025
Parents: e9089631, f7f10598

    Merge pull request #597 from kvcache-ai/feat-more-context

    Feat more context
Showing 9 changed files with 146 additions and 45 deletions (+146 / -45):

    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml         +12   -1
    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml                   +12   -0
    ktransformers/optimize/optimize_rules/Mixtral.yaml                            +10   -0
    ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml  +11   -1
    ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml            +10   -0
    ktransformers/server/backend/interfaces/ktransformers.py                       +1   -1
    ktransformers/util/custom_gguf.py                                             +78  -41
    ktransformers/util/utils.py                                                    +1   -1
    test_prompt.txt                                                               +11   -0
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml  (view file @ 72d09f3f)

@@ -135,7 +135,18 @@
       prefill_device: "cuda:0"

 - match:
-    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)|(lm_head)"
+    name: "^lm_head"
+    class: torch.nn.Linear
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+      generate_op: "KLinearMarlin"
+      prefill_op: "KLinearTorch"
+
+- match:
+    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)"
   replace:
     class: "default"
     kwargs:
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml  (view file @ 72d09f3f)

@@ -5,6 +5,18 @@
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
+
+- match:
+    name: "^lm_head$"  # regular expression
+    class: torch.nn.Linear  # only match modules matching name and class simultaneously
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "KLinearMarlin"
+      prefill_op: "KLinearTorch"
+
 - match:
     name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
     class: torch.nn.Linear  # only match modules matching name and class simultaneously
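Each rule in these YAML files pairs a match section (a name regex, optionally a module class) with a replace section naming the injected operator and its kwargs; the new entries above route lm_head through KTransformersLinear with a Marlin generate kernel. Below is a minimal sketch of the name-plus-class matching idea; it is illustrative only, not the ktransformers injection code, and select_modules and the toy model are hypothetical.

import re
import torch

# Illustrative helper, not the ktransformers injection code: list the modules a
# rule's name regex + class constraint would select, mirroring
#   match: { name: "^lm_head$", class: torch.nn.Linear }
def select_modules(model: torch.nn.Module, name_pattern: str, cls=torch.nn.Linear):
    pattern = re.compile(name_pattern)
    return [name for name, module in model.named_modules()
            if pattern.search(name) and isinstance(module, cls)]

# Toy model: only the module literally named "lm_head" matches the regex.
toy = torch.nn.ModuleDict({"lm_head": torch.nn.Linear(8, 8),
                           "proj": torch.nn.Linear(8, 8)})
print(select_modules(toy, r"^lm_head$"))   # ['lm_head']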
ktransformers/optimize/optimize_rules/Mixtral.yaml  (view file @ 72d09f3f)

@@ -15,6 +15,16 @@
       prefill_device: "cuda"
       generate_op: "KLinearMarlin"
       prefill_op: "KLinearTorch"
+- match:
+    name: "^lm_head"
+    class: torch.nn.Linear
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "KLinearMarlin"
+      prefill_op: "KLinearTorch"
 - match:
     name: "^model\\.layers\\..*\\.block_sparse_moe$"
     class: ktransformers.models.modeling_mixtral.MixtralSparseMoeBlock
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml  (view file @ 72d09f3f)

@@ -77,9 +77,19 @@
     kwargs:
       generate_device: "cpu"
       prefill_device: "cpu"
+- match:
+    name: "^lm_head"
+    class: torch.nn.Linear
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear
+    kwargs:
+      generate_device: "cuda:1"
+      prefill_device: "cuda:1"
+      generate_op: "KLinearMarlin"
+      prefill_op: "KLinearTorch"
 - match:
-    name: "(^model.norm)|(^lm_head)"
+    name: "(^model.norm)"
   replace:
     class: "default"
     kwargs:
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml  (view file @ 72d09f3f)

@@ -15,6 +15,16 @@
       prefill_device: "cuda"
       generate_op: "KLinearMarlin"
       prefill_op: "KLinearTorch"
+- match:
+    name: "^lm_head"
+    class: torch.nn.Linear
+  replace:
+    class: ktransformers.operators.linear.KTransformersLinear
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
+      generate_op: "KLinearMarlin"
+      prefill_op: "KLinearTorch"
 - match:
     name: "^model\\.layers\\..*\\.mlp$"
     class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
ktransformers/server/backend/interfaces/ktransformers.py  (view file @ 72d09f3f)

@@ -25,10 +25,10 @@ class KTransformersThreadContext(TransformersThreadContext):

 class KTransformersInterface(TransformersInterface):
     def __init__(self, args: ConfigArgs = default_args):
         self.args = args
-        torch.set_default_dtype(torch.bfloat16)
         torch.set_grad_enabled(False)
         self.tokenizer = AutoTokenizer.from_pretrained(args.model_dir, device=args.device, trust_remote_code=args.trust_remote_code)
         config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=args.trust_remote_code)
+        torch.set_default_dtype(config.torch_dtype)
         if config.architectures[0] == "Qwen2MoeForCausalLM":
             config._attn_implementation = "flash_attention_2"
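The interface previously forced torch.bfloat16 as the process-wide default; it now takes the default dtype from the checkpoint's AutoConfig, which in turn feeds the torch.get_default_dtype() defaults added in custom_gguf.py below. A minimal sketch of the effect, with config_torch_dtype standing in for config.torch_dtype:

import torch

# Minimal sketch: whatever floating dtype the checkpoint config declares becomes the
# process default, and the target_dtype = torch.get_default_dtype() defaults added in
# custom_gguf.py pick it up. torch.bfloat16 here is a stand-in for config.torch_dtype.
config_torch_dtype = torch.bfloat16
torch.set_default_dtype(config_torch_dtype)
assert torch.get_default_dtype() == torch.bfloat16
print(torch.empty(2).dtype)   # torch.bfloat16, no longer hard-coded in the interface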
ktransformers/util/custom_gguf.py  (view file @ 72d09f3f)

@@ -25,6 +25,7 @@ import os
 from enum import IntEnum
 import torch
 import KTransformersOps
+import ctypes

 class GGMLQuantizationType(IntEnum):
     F32 = 0
@@ -285,7 +286,7 @@ class GGUFLoader:
         itemsize = int(np.empty([], dtype=item_type).itemsize)
         return mmap_data[offset : offset + itemsize * item_count]

-    def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "gpu") -> torch.Tensor:
+    def load_expert_tensor(self, name, data, expert_id, elements_per_expert, device = "cuda", target_dtype = torch.get_default_dtype()) -> torch.Tensor:
         t = self.tensor_info[name]
         if device.lower() == "cpu":
             print(f"loading expert {expert_id} of {name} with CPU")
@@ -304,16 +305,16 @@ class GGUFLoader:
         data = data[offset: offset + block_size * blocks_per_experts]

         if "cuda" in device.lower():
-            values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
+            values = GGML_DEQUANTIZE_GPU[ggml_name](data, device, target_dtype)
         else:
             values = GGML_DEQUANTIZE[ggml_name](data)
-            values = torch.from_numpy(values)
+            values = torch.from_numpy(values.copy())

         values = values.view(shape[-2::-1])

         return values

-    def load_gguf_tensor(self, name: str, device:str = "cpu") -> torch.Tensor:
+    def load_gguf_tensor(self, name: str, device:str = "cpu", target_dtype = torch.get_default_dtype()) -> torch.Tensor:
         t = self.tensor_info[name]
         if device.lower() == "cpu":
             print(f"loading {name} with CPU")
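With the widened signatures, callers can request the dequantized dtype directly from the loader. A hedged usage sketch follows; the GGUF directory path and tensor name are placeholders, and the single-argument constructor call is an assumption about how GGUFLoader is built elsewhere in ktransformers.

import torch
from ktransformers.util.custom_gguf import GGUFLoader

# Hedged usage sketch: dequantize a tensor straight into a requested dtype.
loader = GGUFLoader("/path/to/gguf/")                     # placeholder path
w = loader.load_gguf_tensor("blk.0.attn_q.weight",        # placeholder tensor name
                            device="cuda",
                            target_dtype=torch.bfloat16)
print(w.dtype, w.device)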
@@ -328,16 +329,36 @@ class GGUFLoader:
         data = self.get_mmap_tensor(name)

-        if "cuda" in device.lower():
-            values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
-            #values = GGML_DEQUANTIZE[ggml_name](data)
-            #print("load_gguf_tensor")
-            #values = torch.from_numpy(values).to(device = device)
-        else:
-            values = GGML_DEQUANTIZE[ggml_name](data)
-            values = torch.from_numpy(values)
+        block_size = GGML_BLOCK_SIZES[ggml_name]
+        elements_per_block = GGML_ELEMENTS_PER_BLOCK[ggml_name]
+        num_elements = int(np.prod(shape))
+        num_blocks = num_elements // elements_per_block
+
+        blocks_per_iter = 16384
+        if num_blocks > blocks_per_iter:  # dequant large tensor
+            values = torch.empty((num_blocks, elements_per_block), dtype=torch.float, device=device)
+            for i in range((num_blocks + blocks_per_iter - 1) // blocks_per_iter):
+                blocks_begin = i * blocks_per_iter
+                blocks_end = min(blocks_begin + blocks_per_iter, num_blocks)
+                if "cuda" in device.lower():
+                    cur_values = GGML_DEQUANTIZE_GPU[ggml_name](data[blocks_begin * block_size : blocks_end * block_size], device, target_dtype)
+                else:
+                    cur_values = GGML_DEQUANTIZE[ggml_name](data[blocks_begin * block_size : blocks_end * block_size])
+                    cur_values = torch.from_numpy(cur_values.copy())
+                cur_values = cur_values.view(-1, elements_per_block)
+                values[blocks_begin:blocks_end] = cur_values
+        else:
+            if "cuda" in device.lower():
+                values = GGML_DEQUANTIZE_GPU[ggml_name](data, device)
+            else:
+                values = GGML_DEQUANTIZE[ggml_name](data)
+                values = torch.from_numpy(values)
+
+        if ggml_name == "BF16":
+            values = values.view(torch.bfloat16)
         values = values.view(shape[::-1])

         if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
             n_head = self.gguf_file_meta['llama.attention.head_count']
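The new load_gguf_tensor path caps peak memory by dequantizing at most blocks_per_iter (16384) quantization blocks per iteration and writing each slice into a preallocated output tensor. Below is a standalone sketch of the same chunking pattern under a fake "quant" format; chunked_dequantize and the demo buffer are illustrative, not library code.

import numpy as np
import torch

# Sketch of the chunking idea: dequantize a block-quantized buffer a bounded number of
# blocks per iteration, so peak temporary memory stays near
# blocks_per_iter * elements_per_block floats rather than the whole tensor.
def chunked_dequantize(raw: bytes, block_size: int, elements_per_block: int,
                       dequant_block, blocks_per_iter: int = 16384) -> torch.Tensor:
    num_blocks = len(raw) // block_size
    out = torch.empty((num_blocks, elements_per_block), dtype=torch.float)
    for begin in range(0, num_blocks, blocks_per_iter):
        end = min(begin + blocks_per_iter, num_blocks)
        chunk = raw[begin * block_size : end * block_size]
        out[begin:end] = torch.from_numpy(dequant_block(chunk).reshape(-1, elements_per_block))
    return out.view(-1)

# Tiny demo with a fake format: 4 float32 values per 16-byte block.
demo = np.arange(32, dtype=np.float32).tobytes()
print(chunked_dequantize(demo, 16, 4,
                         lambda b: np.frombuffer(b, dtype=np.float32).copy(),
                         blocks_per_iter=2).shape)   # torch.Size([32])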
@@ -433,14 +454,15 @@ def dequantize_q2_k(data):
     return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4)

-def dequantize_q2_k_gpu(data, device:str ="cuda"):
+def dequantize_q2_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
     block_size = GGML_BLOCK_SIZES["Q2_K"]
+    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q2_K"]
     data = np.frombuffer(data, dtype=data.dtype)
     device = torch.device(device)
     # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
     # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
-    data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_q2_k(data, block_size, device)
+    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
+    return KTransformersOps.dequantize_q2_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

 def dequantize_q3_k(data):
     # C implementation
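This hunk, and the analogous dequantize_*_gpu changes below, replace the torch.from_numpy hand-off with a raw pointer plus element count, which sidesteps the non-writable-numpy warning noted in the TODO. A small sketch of just the pointer extraction on a plain numpy buffer; the consuming KTransformersOps CUDA kernels are not shown.

import ctypes
import numpy as np

# Sketch of the pointer hand-off used above: take the address of a numpy buffer
# as a C int8 pointer, without copying the data.
data = np.frombuffer(b"\x01\x02\x03\x04", dtype=np.int8)
c_pointer = ctypes.addressof(
    ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
assert c_pointer == data.ctypes.data        # same raw address, no copy made
print(c_pointer, data.size)                 # address plus element count, as passed to the kernels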
@@ -484,14 +506,15 @@ def dequantize_q3_k(data):
                      (((qs[:, 48:64] >> 6) & 3) - bits[:, 16:, 7])
                      ], axis=1)

-def dequantize_q3_k_gpu(data, device:str ="cuda"):
+def dequantize_q3_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
     block_size = GGML_BLOCK_SIZES["Q3_K"]
+    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q3_K"]
     data = np.frombuffer(data, dtype=data.dtype)
     device = torch.device(device)
     # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
     # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
-    data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_q3_k(data, block_size, device)
+    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
+    return KTransformersOps.dequantize_q3_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

 def dequantize_q4_k(data):
     # C implementation
@@ -515,13 +538,15 @@ def dequantize_q4_k(data):
     # Dequantize final weights using scales and offsets
     return factors * qs2 - offsets

-def dequantize_q4_k_gpu(data, device:str ="cuda"):
+def dequantize_q4_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
+    block_size = GGML_BLOCK_SIZES["Q4_K"]
+    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q4_K"]
     data = np.frombuffer(data, dtype=data.dtype)
     device = torch.device(device)
     # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
     # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
-    data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_q4_k(data, 144, device)
+    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
+    return KTransformersOps.dequantize_q4_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

 def dequantize_q5_k(data):
     # C implementation
@@ -579,14 +604,15 @@ def dequantize_q5_k(data):
         d8 * (qs_hi_4[:, 3] + (bits[:, :, 7] << 4)) - m8,
     ], axis=1)

-def dequantize_q5_k_gpu(data, device:str ="cuda"):
+def dequantize_q5_k_gpu(data, device:str ="cuda", target_dtype = torch.get_default_dtype()):
     block_size = GGML_BLOCK_SIZES["Q5_K"]
+    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q5_K"]
     data = np.frombuffer(data, dtype=data.dtype)
     device = torch.device(device)
     # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
     # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
-    data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_q5_k(data, block_size, device)
+    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
+    return KTransformersOps.dequantize_q5_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

 def dequantize_q6_k(data):
     # C implementation
@@ -637,13 +663,14 @@ def dequantize_q6_k(data):
     ], axis=1)

 # @torch.jit.script
-def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"):
+def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda", target_dtype = torch.get_default_dtype()):
     block_size = GGML_BLOCK_SIZES["Q6_K"]
+    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q6_K"]
     device = torch.device(device)
     num_blocks = len(data) // block_size
     data = np.frombuffer(data, dtype=data.dtype)
-    data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_q6_k(data, block_size, device)
+    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
+    return KTransformersOps.dequantize_q6_k(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

 kvalues_iq4nl = np.array([-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113], dtype=np.int8)
@@ -677,13 +704,14 @@ def dequantize_iq4_xs(data):
     return y.flatten()

-def dequantize_iq4_xs_gpu(data: np.ndarray, device:str = "cuda"):
+def dequantize_iq4_xs_gpu(data: np.ndarray, device:str = "cuda", target_dtype = torch.get_default_dtype()):
     block_size = GGML_BLOCK_SIZES["IQ4_XS"]
+    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["IQ4_XS"]
     device = torch.device(device)
     num_blocks = len(data) // block_size
     data = np.frombuffer(data, dtype=data.dtype)
-    data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_iq4_xs(data, block_size, device)
+    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
+    return KTransformersOps.dequantize_iq4_xs(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)

 def dequantize_q4_0(data):
     # C implementation
@@ -700,7 +728,7 @@ def dequantize_q4_0(data):
         scales * ((qs >> 4).astype(np.int8) - 8),
     ], axis=1)

-def dequantize_q4_0_gpu(data):
+def dequantize_q4_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
     raise NotImplementedError()

 def dequantize_q5_0(data):
@@ -724,7 +752,7 @@ def dequantize_q5_0(data):
         scales * x1,
     ], axis=1)

-def dequantize_q5_0_gpu(data):
+def dequantize_q5_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
     raise NotImplementedError()

 def dequantize_q8_0(data):
@@ -736,32 +764,41 @@ def dequantize_q8_0(data):
     qs = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, 2 + 32)[:, 2:]
     return scales * qs

-def dequantize_q8_0_gpu(data, device:str = "cuda"):
+def dequantize_q8_0_gpu(data, device:str = "cuda", target_dtype = torch.get_default_dtype()):
     # C struct definition
     # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
     num_blocks = len(data) // GGML_BLOCK_SIZES["Q8_0"]
+    block_size = GGML_BLOCK_SIZES["Q8_0"]
+    ele_per_blk = GGML_ELEMENTS_PER_BLOCK["Q8_0"]
     device = torch.device(device)
     data = np.frombuffer(data, dtype=data.dtype)
-    data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_q8_0(data, 34, device)
+    c_pointer = ctypes.addressof(ctypes.cast(data.ctypes.data, ctypes.POINTER(ctypes.c_int8)).contents)
+    return KTransformersOps.dequantize_q8_0(c_pointer, data.size, block_size, ele_per_blk, device, target_dtype)


 def dequantize_f32(data):
     return np.frombuffer(data, dtype=np.float32)

-def dequantize_f32_gpu(data, device):
+def dequantize_f32_gpu(data, device, target_dtype = torch.get_default_dtype()):
     data = np.frombuffer(data, dtype=np.float32)
-    res = torch.from_numpy(data)
-    res_gpu = torch.empty_like(res, device=device)
+    res = torch.from_numpy(data.copy())
+    res_gpu = torch.empty_like(res, device=device, dtype=target_dtype)
     res_gpu.copy_(res)
     return res_gpu

 def dequantize_f16(data):
     return np.frombuffer(data, dtype=np.float16)

-def dequantize_f16_gpu(data, device):
+def dequantize_f16_gpu(data, device, target_dtype = torch.get_default_dtype()):
     data = np.frombuffer(data, dtype=np.float16)
-    res = torch.from_numpy(data)
-    res_gpu = torch.empty_like(res, device=device)
+    res = torch.from_numpy(data.copy())
+    res_gpu = torch.empty_like(res, device=device, dtype=target_dtype)
     res_gpu.copy_(res)
     return res_gpu

+def dequantize_bf16_gpu(data, device, target_dtype = torch.get_default_dtype()):
+    data = np.frombuffer(data, dtype=np.float16)
+    res = torch.from_numpy(data.copy())
+    res_gpu = torch.empty_like(res, device=device)
+    res_gpu.copy_(res)
+    return res_gpu
@@ -784,7 +821,7 @@ GGML_DEQUANTIZE = {
 GGML_DEQUANTIZE_GPU = {
     "F32": dequantize_f32_gpu,
     "F16": dequantize_f16_gpu,
-    "BF16": dequantize_f16_gpu,
+    "BF16": dequantize_bf16_gpu,
     "Q4_0": dequantize_q4_0_gpu,
     "Q5_0": dequantize_q5_0_gpu,
     "Q8_0": dequantize_q8_0_gpu,
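Routing BF16 through the FP16 helper mixed up two 16-bit formats that share a width but not a bit layout; the dedicated dequantize_bf16_gpu keeps the raw 16-bit pattern untouched so load_gguf_tensor can reinterpret it with view(torch.bfloat16). A brief sketch of the distinction, illustrative only and not the library code path:

import numpy as np
import torch

# BF16 bytes must be reinterpreted (viewed) as bfloat16, not decoded as IEEE half.
raw = torch.tensor([1.0, 2.0, -3.5], dtype=torch.bfloat16).view(torch.int16).numpy().tobytes()

wrong = torch.from_numpy(np.frombuffer(raw, dtype=np.float16).copy())                    # decoded as FP16
right = torch.from_numpy(np.frombuffer(raw, dtype=np.int16).copy()).view(torch.bfloat16)  # bits reinterpreted

print(wrong)   # garbage values
print(right)   # tensor([ 1.0000,  2.0000, -3.5000], dtype=torch.bfloat16)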
ktransformers/util/utils.py  (view file @ 72d09f3f)

@@ -79,7 +79,7 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
             raise Exception(f"can't find {translated_key} in GGUF file!")

 def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix=''):
-    # print(f"recursively loading weights {prefix},{return_when_injected=}, {only_load_injected=}")
+    #print(f"recursively loading weights {prefix}")
     if not isinstance(module, base_operator.BaseInjectedModule):
         load_cur_state_dict(module, gguf_loader, prefix)
         for name, child in module._modules.items():
test_prompt.txt  (view file @ 72d09f3f)

Diff collapsed (+11 lines).