Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
ox696c
ktransformers
Commits
72d09f3f
Unverified
Commit
72d09f3f
authored
Feb 22, 2025
by
Atream
Committed by
GitHub
Feb 22, 2025
Browse files
Merge pull request #597 from kvcache-ai/feat-more-context
Feat more context
parents
e9089631
f7f10598
Changes
29
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
146 additions
and
45 deletions
+146
-45
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml
...s/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml
+12
-1
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
+12
-0
ktransformers/optimize/optimize_rules/Mixtral.yaml
ktransformers/optimize/optimize_rules/Mixtral.yaml
+10
-0
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml
...ize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml
+11
-1
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
...mers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
+10
-0
ktransformers/server/backend/interfaces/ktransformers.py
ktransformers/server/backend/interfaces/ktransformers.py
+1
-1
ktransformers/util/custom_gguf.py
ktransformers/util/custom_gguf.py
+78
-41
ktransformers/util/utils.py
ktransformers/util/utils.py
+1
-1
test_prompt.txt
test_prompt.txt
+11
-0
No files found.
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml
View file @
72d09f3f
...
...
@@ -135,7 +135,18 @@
prefill_device
:
"
cuda:0"
-
match
:
name
:
"
(^model
\\
.layers
\\
.([3456][0-9])
\\
.)|(model.norm)|(lm_head)"
name
:
"
^lm_head"
class
:
torch.nn.Linear
replace
:
class
:
ktransformers.operators.linear.KTransformersLinear
kwargs
:
generate_device
:
"
cuda:1"
prefill_device
:
"
cuda:1"
generate_op
:
"
KLinearMarlin"
prefill_op
:
"
KLinearTorch"
-
match
:
name
:
"
(^model
\\
.layers
\\
.([3456][0-9])
\\
.)|(model.norm)"
replace
:
class
:
"
default"
kwargs
:
...
...
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml
View file @
72d09f3f
...
...
@@ -5,6 +5,18 @@
kwargs
:
generate_device
:
"
cuda"
prefill_device
:
"
cuda"
-
match
:
name
:
"
^lm_head$"
# regular expression
class
:
torch.nn.Linear
# only match modules matching name and class simultaneously
replace
:
class
:
ktransformers.operators.linear.KTransformersLinear
# optimized Kernel on quantized data types
kwargs
:
generate_device
:
"
cuda"
prefill_device
:
"
cuda"
generate_op
:
"
KLinearMarlin"
prefill_op
:
"
KLinearTorch"
-
match
:
name
:
"
^model
\\
.layers
\\
.(?!.*self_attn
\\
.kv_b_proj).*$"
# regular expression
class
:
torch.nn.Linear
# only match modules matching name and class simultaneously
...
...
ktransformers/optimize/optimize_rules/Mixtral.yaml
View file @
72d09f3f
...
...
@@ -15,6 +15,16 @@
prefill_device
:
"
cuda"
generate_op
:
"
KLinearMarlin"
prefill_op
:
"
KLinearTorch"
-
match
:
name
:
"
^lm_head"
class
:
torch.nn.Linear
replace
:
class
:
ktransformers.operators.linear.KTransformersLinear
kwargs
:
generate_device
:
"
cuda"
prefill_device
:
"
cuda"
generate_op
:
"
KLinearMarlin"
prefill_op
:
"
KLinearTorch"
-
match
:
name
:
"
^model
\\
.layers
\\
..*
\\
.block_sparse_moe$"
class
:
ktransformers.models.modeling_mixtral.MixtralSparseMoeBlock
...
...
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml
View file @
72d09f3f
...
...
@@ -77,9 +77,19 @@
kwargs
:
generate_device
:
"
cpu"
prefill_device
:
"
cpu"
-
match
:
name
:
"
^lm_head"
class
:
torch.nn.Linear
replace
:
class
:
ktransformers.operators.linear.KTransformersLinear
kwargs
:
generate_device
:
"
cuda:1"
prefill_device
:
"
cuda:1"
generate_op
:
"
KLinearMarlin"
prefill_op
:
"
KLinearTorch"
-
match
:
name
:
"
(^model.norm)
|(^lm_head)
"
name
:
"
(^model.norm)"
replace
:
class
:
"
default"
kwargs
:
...
...
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
View file @
72d09f3f
...
...
@@ -15,6 +15,16 @@
prefill_device
:
"
cuda"
generate_op
:
"
KLinearMarlin"
prefill_op
:
"
KLinearTorch"
-
match
:
name
:
"
^lm_head"
class
:
torch.nn.Linear
replace
:
class
:
ktransformers.operators.linear.KTransformersLinear
kwargs
:
generate_device
:
"
cuda"
prefill_device
:
"
cuda"
generate_op
:
"
KLinearMarlin"
prefill_op
:
"
KLinearTorch"
-
match
:
name
:
"
^model
\\
.layers
\\
..*
\\
.mlp$"
class
:
ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
...
...
ktransformers/server/backend/interfaces/ktransformers.py
View file @
72d09f3f
...
...
@@ -25,10 +25,10 @@ class KTransformersThreadContext(TransformersThreadContext):
class
KTransformersInterface
(
TransformersInterface
):
def
__init__
(
self
,
args
:
ConfigArgs
=
default_args
):
self
.
args
=
args
torch
.
set_default_dtype
(
torch
.
bfloat16
)
torch
.
set_grad_enabled
(
False
)
self
.
tokenizer
=
AutoTokenizer
.
from_pretrained
(
args
.
model_dir
,
device
=
args
.
device
,
trust_remote_code
=
args
.
trust_remote_code
)
config
=
AutoConfig
.
from_pretrained
(
args
.
model_dir
,
trust_remote_code
=
args
.
trust_remote_code
)
torch
.
set_default_dtype
(
config
.
torch_dtype
)
if
config
.
architectures
[
0
]
==
"Qwen2MoeForCausalLM"
:
config
.
_attn_implementation
=
"flash_attention_2"
...
...
ktransformers/util/custom_gguf.py
View file @
72d09f3f
...
...
@@ -25,6 +25,7 @@ import os
from
enum
import
IntEnum
import
torch
import
KTransformersOps
import
ctypes
class
GGMLQuantizationType
(
IntEnum
):
F32
=
0
...
...
@@ -285,7 +286,7 @@ class GGUFLoader:
itemsize
=
int
(
np
.
empty
([],
dtype
=
item_type
).
itemsize
)
return
mmap_data
[
offset
:
offset
+
itemsize
*
item_count
]
def
load_expert_tensor
(
self
,
name
,
data
,
expert_id
,
elements_per_expert
,
device
=
"
gpu"
)
->
torch
.
Tensor
:
def
load_expert_tensor
(
self
,
name
,
data
,
expert_id
,
elements_per_expert
,
device
=
"
cuda"
,
target_dtype
=
torch
.
get_default_dtype
()
)
->
torch
.
Tensor
:
t
=
self
.
tensor_info
[
name
]
if
device
.
lower
()
==
"cpu"
:
print
(
f
"loading expert
{
expert_id
}
of
{
name
}
with CPU"
)
...
...
@@ -304,16 +305,16 @@ class GGUFLoader:
data
=
data
[
offset
:
offset
+
block_size
*
blocks_per_experts
]
if
"cuda"
in
device
.
lower
():
values
=
GGML_DEQUANTIZE_GPU
[
ggml_name
](
data
,
device
)
values
=
GGML_DEQUANTIZE_GPU
[
ggml_name
](
data
,
device
,
target_dtype
)
else
:
values
=
GGML_DEQUANTIZE
[
ggml_name
](
data
)
values
=
torch
.
from_numpy
(
values
)
values
=
torch
.
from_numpy
(
values
.
copy
()
)
values
=
values
.
view
(
shape
[
-
2
::
-
1
])
return
values
def
load_gguf_tensor
(
self
,
name
:
str
,
device
:
str
=
"cpu"
)
->
torch
.
Tensor
:
def
load_gguf_tensor
(
self
,
name
:
str
,
device
:
str
=
"cpu"
,
target_dtype
=
torch
.
get_default_dtype
()
)
->
torch
.
Tensor
:
t
=
self
.
tensor_info
[
name
]
if
device
.
lower
()
==
"cpu"
:
print
(
f
"loading
{
name
}
with CPU"
)
...
...
@@ -328,16 +329,36 @@ class GGUFLoader:
data
=
self
.
get_mmap_tensor
(
name
)
if
"cuda"
in
device
.
lower
():
values
=
GGML_DEQUANTIZE_GPU
[
ggml_name
](
data
,
device
)
#values = GGML_DEQUANTIZE[ggml_name](data)
#print("load_gguf_tensor")
#values = torch.from_numpy(values).to(device = device)
block_size
=
GGML_BLOCK_SIZES
[
ggml_name
]
elements_per_block
=
GGML_ELEMENTS_PER_BLOCK
[
ggml_name
]
num_elements
=
int
(
np
.
prod
(
shape
))
num_blocks
=
num_elements
//
elements_per_block
blocks_per_iter
=
16384
if
num_blocks
>
blocks_per_iter
:
# dequant large tensor
values
=
torch
.
empty
((
num_blocks
,
elements_per_block
),
dtype
=
torch
.
float
,
device
=
device
)
for
i
in
range
(
(
num_blocks
+
blocks_per_iter
-
1
)
//
blocks_per_iter
):
blocks_begin
=
i
*
blocks_per_iter
blocks_end
=
min
(
blocks_begin
+
blocks_per_iter
,
num_blocks
)
if
"cuda"
in
device
.
lower
():
cur_values
=
GGML_DEQUANTIZE_GPU
[
ggml_name
](
data
[
blocks_begin
*
block_size
:
blocks_end
*
block_size
],
device
,
target_dtype
)
else
:
cur_values
=
GGML_DEQUANTIZE
[
ggml_name
](
data
[
blocks_begin
*
block_size
:
blocks_end
*
block_size
])
cur_values
=
torch
.
from_numpy
(
cur_values
.
copy
())
cur_values
=
cur_values
.
view
(
-
1
,
elements_per_block
)
values
[
blocks_begin
:
blocks_end
]
=
cur_values
else
:
values
=
GGML_DEQUANTIZE
[
ggml_name
](
data
)
values
=
torch
.
from_numpy
(
values
)
if
"cuda"
in
device
.
lower
():
values
=
GGML_DEQUANTIZE_GPU
[
ggml_name
](
data
,
device
)
else
:
values
=
GGML_DEQUANTIZE
[
ggml_name
](
data
)
values
=
torch
.
from_numpy
(
values
)
if
ggml_name
==
"BF16"
:
values
=
values
.
view
(
torch
.
bfloat16
)
values
=
values
.
view
(
shape
[::
-
1
])
if
"attn_q"
in
name
and
self
.
gguf_file_meta
[
'general.architecture'
]
in
[
"llama"
]:
n_head
=
self
.
gguf_file_meta
[
'llama.attention.head_count'
]
...
...
@@ -433,14 +454,15 @@ def dequantize_q2_k(data):
return
d
*
(
scales
&
15
)
*
(
tmp
&
3
)
-
dmin
*
(
scales
>>
4
)
def
dequantize_q2_k_gpu
(
data
,
device
:
str
=
"cuda"
):
def
dequantize_q2_k_gpu
(
data
,
device
:
str
=
"cuda"
,
target_dtype
=
torch
.
get_default_dtype
()
):
block_size
=
GGML_BLOCK_SIZES
[
"Q2_K"
]
ele_per_blk
=
GGML_ELEMENTS_PER_BLOCK
[
"Q2_K"
]
data
=
np
.
frombuffer
(
data
,
dtype
=
data
.
dtype
)
device
=
torch
.
device
(
device
)
# TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
# the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
data
=
torch
.
from_numpy
(
data
)
return
KTransformersOps
.
dequantize_q2_k
(
data
,
block_size
,
devic
e
)
c_pointer
=
ctypes
.
addressof
(
ctypes
.
cast
(
data
.
ctypes
.
data
,
ctypes
.
POINTER
(
ctypes
.
c_int8
)).
contents
)
return
KTransformersOps
.
dequantize_q2_k
(
c_pointer
,
data
.
size
,
block_size
,
ele_per_blk
,
device
,
target_dtyp
e
)
def
dequantize_q3_k
(
data
):
# C implementation
...
...
@@ -484,14 +506,15 @@ def dequantize_q3_k(data):
(((
qs
[:,
48
:
64
]
>>
6
)
&
3
)
-
bits
[:,
16
:,
7
])
],
axis
=
1
)
def
dequantize_q3_k_gpu
(
data
,
device
:
str
=
"cuda"
):
def
dequantize_q3_k_gpu
(
data
,
device
:
str
=
"cuda"
,
target_dtype
=
torch
.
get_default_dtype
()
):
block_size
=
GGML_BLOCK_SIZES
[
"Q3_K"
]
ele_per_blk
=
GGML_ELEMENTS_PER_BLOCK
[
"Q3_K"
]
data
=
np
.
frombuffer
(
data
,
dtype
=
data
.
dtype
)
device
=
torch
.
device
(
device
)
# TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
# the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
data
=
torch
.
from_numpy
(
data
)
return
KTransformersOps
.
dequantize_q3_k
(
data
,
block_size
,
devic
e
)
c_pointer
=
ctypes
.
addressof
(
ctypes
.
cast
(
data
.
ctypes
.
data
,
ctypes
.
POINTER
(
ctypes
.
c_int8
)).
contents
)
return
KTransformersOps
.
dequantize_q3_k
(
c_pointer
,
data
.
size
,
block_size
,
ele_per_blk
,
device
,
target_dtyp
e
)
def
dequantize_q4_k
(
data
):
# C implementation
...
...
@@ -515,13 +538,15 @@ def dequantize_q4_k(data):
# Dequantize final weights using scales and offsets
return
factors
*
qs2
-
offsets
def
dequantize_q4_k_gpu
(
data
,
device
:
str
=
"cuda"
):
def
dequantize_q4_k_gpu
(
data
,
device
:
str
=
"cuda"
,
target_dtype
=
torch
.
get_default_dtype
()):
block_size
=
GGML_BLOCK_SIZES
[
"Q4_K"
]
ele_per_blk
=
GGML_ELEMENTS_PER_BLOCK
[
"Q4_K"
]
data
=
np
.
frombuffer
(
data
,
dtype
=
data
.
dtype
)
device
=
torch
.
device
(
device
)
# TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
# the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
data
=
torch
.
from_numpy
(
data
)
return
KTransformersOps
.
dequantize_q4_k
(
data
,
144
,
devic
e
)
c_pointer
=
ctypes
.
addressof
(
ctypes
.
cast
(
data
.
ctypes
.
data
,
ctypes
.
POINTER
(
ctypes
.
c_int8
)).
contents
)
return
KTransformersOps
.
dequantize_q4_k
(
c_pointer
,
data
.
size
,
block_size
,
ele_per_blk
,
device
,
target_dtyp
e
)
def
dequantize_q5_k
(
data
):
# C implementation
...
...
@@ -579,14 +604,15 @@ def dequantize_q5_k(data):
d8
*
(
qs_hi_4
[:,
3
]
+
(
bits
[:,
:,
7
]
<<
4
))
-
m8
,
],
axis
=
1
)
def
dequantize_q5_k_gpu
(
data
,
device
:
str
=
"cuda"
):
def
dequantize_q5_k_gpu
(
data
,
device
:
str
=
"cuda"
,
target_dtype
=
torch
.
get_default_dtype
()
):
block_size
=
GGML_BLOCK_SIZES
[
"Q5_K"
]
ele_per_blk
=
GGML_ELEMENTS_PER_BLOCK
[
"Q5_K"
]
data
=
np
.
frombuffer
(
data
,
dtype
=
data
.
dtype
)
device
=
torch
.
device
(
device
)
# TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
# the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
data
=
torch
.
from_numpy
(
data
)
return
KTransformersOps
.
dequantize_q5_k
(
data
,
block_size
,
devic
e
)
c_pointer
=
ctypes
.
addressof
(
ctypes
.
cast
(
data
.
ctypes
.
data
,
ctypes
.
POINTER
(
ctypes
.
c_int8
)).
contents
)
return
KTransformersOps
.
dequantize_q5_k
(
c_pointer
,
data
.
size
,
block_size
,
ele_per_blk
,
device
,
target_dtyp
e
)
def
dequantize_q6_k
(
data
):
# C implementation
...
...
@@ -637,13 +663,14 @@ def dequantize_q6_k(data):
],
axis
=
1
)
# @torch.jit.script
def
dequantize_q6_k_gpu
(
data
:
np
.
ndarray
,
device
:
str
=
"cuda"
):
def
dequantize_q6_k_gpu
(
data
:
np
.
ndarray
,
device
:
str
=
"cuda"
,
target_dtype
=
torch
.
get_default_dtype
()
):
block_size
=
GGML_BLOCK_SIZES
[
"Q6_K"
]
ele_per_blk
=
GGML_ELEMENTS_PER_BLOCK
[
"Q6_K"
]
device
=
torch
.
device
(
device
)
num_blocks
=
len
(
data
)
//
block_size
data
=
np
.
frombuffer
(
data
,
dtype
=
data
.
dtype
)
data
=
torch
.
from_numpy
(
data
)
return
KTransformersOps
.
dequantize_q6_k
(
data
,
block_size
,
devic
e
)
c_pointer
=
ctypes
.
addressof
(
ctypes
.
cast
(
data
.
ctypes
.
data
,
ctypes
.
POINTER
(
ctypes
.
c_int8
)).
contents
)
return
KTransformersOps
.
dequantize_q6_k
(
c_pointer
,
data
.
size
,
block_size
,
ele_per_blk
,
device
,
target_dtyp
e
)
kvalues_iq4nl
=
np
.
array
([
-
127
,
-
104
,
-
83
,
-
65
,
-
49
,
-
35
,
-
22
,
-
10
,
1
,
13
,
25
,
38
,
53
,
69
,
89
,
113
],
dtype
=
np
.
int8
)
...
...
@@ -677,13 +704,14 @@ def dequantize_iq4_xs(data):
return
y
.
flatten
()
def
dequantize_iq4_xs_gpu
(
data
:
np
.
ndarray
,
device
:
str
=
"cuda"
):
def
dequantize_iq4_xs_gpu
(
data
:
np
.
ndarray
,
device
:
str
=
"cuda"
,
target_dtype
=
torch
.
get_default_dtype
()
):
block_size
=
GGML_BLOCK_SIZES
[
"IQ4_XS"
]
ele_per_blk
=
GGML_ELEMENTS_PER_BLOCK
[
"IQ4_XS"
]
device
=
torch
.
device
(
device
)
num_blocks
=
len
(
data
)
//
block_size
data
=
np
.
frombuffer
(
data
,
dtype
=
data
.
dtype
)
data
=
torch
.
from_numpy
(
data
)
return
KTransformersOps
.
dequantize_iq4_xs
(
data
,
block_size
,
devic
e
)
c_pointer
=
ctypes
.
addressof
(
ctypes
.
cast
(
data
.
ctypes
.
data
,
ctypes
.
POINTER
(
ctypes
.
c_int8
)).
contents
)
return
KTransformersOps
.
dequantize_iq4_xs
(
c_pointer
,
data
.
size
,
block_size
,
ele_per_blk
,
device
,
target_dtyp
e
)
def
dequantize_q4_0
(
data
):
# C implementation
...
...
@@ -700,7 +728,7 @@ def dequantize_q4_0(data):
scales
*
((
qs
>>
4
).
astype
(
np
.
int8
)
-
8
),
],
axis
=
1
)
def
dequantize_q4_0_gpu
(
data
):
def
dequantize_q4_0_gpu
(
data
,
device
:
str
=
"cuda"
,
target_dtype
=
torch
.
get_default_dtype
()
):
raise
NotImplementedError
()
def
dequantize_q5_0
(
data
):
...
...
@@ -724,7 +752,7 @@ def dequantize_q5_0(data):
scales
*
x1
,
],
axis
=
1
)
def
dequantize_q5_0_gpu
(
data
):
def
dequantize_q5_0_gpu
(
data
,
device
:
str
=
"cuda"
,
target_dtype
=
torch
.
get_default_dtype
()
):
raise
NotImplementedError
()
def
dequantize_q8_0
(
data
):
...
...
@@ -736,32 +764,41 @@ def dequantize_q8_0(data):
qs
=
np
.
frombuffer
(
data
,
dtype
=
np
.
int8
).
reshape
(
num_blocks
,
2
+
32
)[:,
2
:]
return
scales
*
qs
def
dequantize_q8_0_gpu
(
data
,
device
:
str
=
"cuda"
):
def
dequantize_q8_0_gpu
(
data
,
device
:
str
=
"cuda"
,
target_dtype
=
torch
.
get_default_dtype
()
):
# C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
num_blocks
=
len
(
data
)
//
GGML_BLOCK_SIZES
[
"Q8_0"
]
block_size
=
GGML_BLOCK_SIZES
[
"Q8_0"
]
ele_per_blk
=
GGML_ELEMENTS_PER_BLOCK
[
"Q8_0"
]
device
=
torch
.
device
(
device
)
data
=
np
.
frombuffer
(
data
,
dtype
=
data
.
dtype
)
data
=
torch
.
from_numpy
(
data
)
return
KTransformersOps
.
dequantize_q8_0
(
data
,
34
,
devic
e
)
c_pointer
=
ctypes
.
addressof
(
ctypes
.
cast
(
data
.
ctypes
.
data
,
ctypes
.
POINTER
(
ctypes
.
c_int8
)).
contents
)
return
KTransformersOps
.
dequantize_q8_0
(
c_pointer
,
data
.
size
,
block_size
,
ele_per_blk
,
device
,
target_dtyp
e
)
def
dequantize_f32
(
data
):
return
np
.
frombuffer
(
data
,
dtype
=
np
.
float32
)
def
dequantize_f32_gpu
(
data
,
device
):
def
dequantize_f32_gpu
(
data
,
device
,
target_dtype
=
torch
.
get_default_dtype
()
):
data
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float32
)
res
=
torch
.
from_numpy
(
data
)
res_gpu
=
torch
.
empty_like
(
res
,
device
=
device
)
res
=
torch
.
from_numpy
(
data
.
copy
()
)
res_gpu
=
torch
.
empty_like
(
res
,
device
=
device
,
dtype
=
target_dtype
)
res_gpu
.
copy_
(
res
)
return
res_gpu
def
dequantize_f16
(
data
):
return
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
)
def
dequantize_f16_gpu
(
data
,
device
):
def
dequantize_f16_gpu
(
data
,
device
,
target_dtype
=
torch
.
get_default_dtype
()):
data
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
)
res
=
torch
.
from_numpy
(
data
.
copy
())
res_gpu
=
torch
.
empty_like
(
res
,
device
=
device
,
dtype
=
target_dtype
)
res_gpu
.
copy_
(
res
)
return
res_gpu
def
dequantize_bf16_gpu
(
data
,
device
,
target_dtype
=
torch
.
get_default_dtype
()):
data
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
)
res
=
torch
.
from_numpy
(
data
)
res
=
torch
.
from_numpy
(
data
.
copy
()
)
res_gpu
=
torch
.
empty_like
(
res
,
device
=
device
)
res_gpu
.
copy_
(
res
)
return
res_gpu
...
...
@@ -784,7 +821,7 @@ GGML_DEQUANTIZE = {
GGML_DEQUANTIZE_GPU
=
{
"F32"
:
dequantize_f32_gpu
,
"F16"
:
dequantize_f16_gpu
,
"BF16"
:
dequantize_f16_gpu
,
"BF16"
:
dequantize_
b
f16_gpu
,
"Q4_0"
:
dequantize_q4_0_gpu
,
"Q5_0"
:
dequantize_q5_0_gpu
,
"Q8_0"
:
dequantize_q8_0_gpu
,
...
...
ktransformers/util/utils.py
View file @
72d09f3f
...
...
@@ -79,7 +79,7 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
raise
Exception
(
f
"can't find
{
translated_key
}
in GGUF file!"
)
def
load_weights
(
module
:
nn
.
Module
,
gguf_loader
:
GGUFLoader
,
prefix
=
''
):
#
print(f"recursively loading weights {prefix}
,{return_when_injected=}, {only_load_injected=}
")
#print(f"recursively loading weights {prefix}")
if
not
isinstance
(
module
,
base_operator
.
BaseInjectedModule
):
load_cur_state_dict
(
module
,
gguf_loader
,
prefix
)
for
name
,
child
in
module
.
_modules
.
items
():
...
...
test_prompt.txt
View file @
72d09f3f
This diff is collapsed.
Click to expand it.
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment