Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
ktransformers
Commits
77a34c28
Unverified
Commit
77a34c28
authored
Aug 15, 2024
by
UnicornChan
Committed by
GitHub
Aug 15, 2024
Browse files
Merge pull request #36 from kvcache-ai/develop-0.1.2
Release v0.1.2
parents
44f57270
395cd3e7
Changes
69
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
215 additions
and
81 deletions
+215
-81
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
...mers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
+29
-9
ktransformers/server/backend/interfaces/ktransformers.py
ktransformers/server/backend/interfaces/ktransformers.py
+5
-1
ktransformers/tests/dequant_gpu.py
ktransformers/tests/dequant_gpu.py
+19
-18
ktransformers/tests/dequant_gpu_t.py
ktransformers/tests/dequant_gpu_t.py
+3
-3
ktransformers/util/cuda_graph_runner.py
ktransformers/util/cuda_graph_runner.py
+14
-4
ktransformers/util/custom_gguf.py
ktransformers/util/custom_gguf.py
+74
-12
ktransformers/util/utils.py
ktransformers/util/utils.py
+60
-31
setup.py
setup.py
+11
-2
third_party/llamafile/sgemm.cpp
third_party/llamafile/sgemm.cpp
+0
-1
No files found.
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
View file @
77a34c28
...
...
@@ -2,36 +2,56 @@
class
:
ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
replace
:
class
:
ktransformers.operators.RoPE.RotaryEmbedding
kwargs
:
generate_device
:
"
cuda"
prefill_device
:
"
cuda"
-
match
:
name
:
"
^model
\\
.layers
\\
..*$"
# regular expression
class
:
torch.nn.Linear
# only match modules matching name and class simultaneously
replace
:
class
:
ktransformers.operators.linear.KTransformerLinear
# optimized Kernel on quantized data types
class
:
ktransformers.operators.linear.KTransformer
s
Linear
# optimized Kernel on quantized data types
kwargs
:
generate_device
:
"
cuda"
prefill_device
:
"
cuda"
generate_op
:
"
Quantized
LinearMarlin"
prefill_op
:
"
Quantized
LinearTorch"
generate_op
:
"
K
LinearMarlin"
prefill_op
:
"
K
LinearTorch"
-
match
:
name
:
"
^model
\\
.layers
\\
..*
\\
.mlp$"
class
:
ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
replace
:
class
:
ktransformers.operators.experts.Qwen2MoeSparseMoeBlockInjected
# mlp module with custom forward function
class
:
ktransformers.operators.experts.KQwen2MoeSparseMoeBlock
# mlp module with custom forward function
kwargs
:
generate_device
:
"
cuda"
prefill_device
:
"
cuda"
-
match
:
name
:
"
^model
\\
.layers
\\
..*
\\
.mlp
\\
.experts$"
replace
:
class
:
ktransformers.operators.experts.KTransformers
MLP
Expert
# custom MoE kernel with expert parallelism
device
:
"
cpu"
# which devices to load this module when initializing
class
:
ktransformers.operators.experts.KTransformersExpert
s
# custom MoE kernel with expert parallelism
#
device: "cpu" # which devices to load this module when initializing
kwargs
:
prefill_device
:
"
cuda"
prefill_
mlp_type
:
"
MLP
ExpertsTorch"
prefill_
op
:
"
K
ExpertsTorch"
generate_device
:
"
cpu"
generate_
mlp_type
:
"
MLPCPU
Experts"
generate_
op
:
"
K
Experts
CPU
"
out_device
:
"
cuda"
recursive
:
False
# don't recursively inject submodules of this module
-
match
:
name
:
"
^model$"
replace
:
class
:
"
ktransformers.operators.
layer_wise_prefill.Qwen2MoeModelPerLayerPrefil
l"
class
:
"
ktransformers.operators.
models.KQwen2MoeMode
l"
kwargs
:
per_layer_prefill_intput_threshold
:
0
# 0 disables layer-wise prefill
-
match
:
name
:
"
^model.embed_tokens"
replace
:
class
:
"
default"
kwargs
:
generate_device
:
"
cpu"
prefill_device
:
"
cpu"
-
match
:
name
:
"
^model
\\
.layers
\\
..*
\\
."
replace
:
class
:
"
default"
kwargs
:
generate_device
:
"
cuda"
prefill_device
:
"
cuda"
\ No newline at end of file
ktransformers/server/backend/interfaces/ktransformers.py
View file @
77a34c28
...
...
@@ -6,6 +6,7 @@ from ktransformers.optimize.optimize import optimize_and_load_gguf
from
ktransformers.models.custom_cache
import
StaticCache
from
ktransformers.util.cuda_graph_runner
import
CUDAGraphRunner
from
ktransformers.local_chat
import
custom_models
,
default_optimize_rules
from
ktransformers.util.utils
import
get_device
class
KTransformersThreadContext
(
TransformersThreadContext
):
...
...
@@ -48,8 +49,11 @@ class KTransformersInterface(TransformersInterface):
def
decode_one_tokens
(
self
):
if
not
hasattr
(
self
,
"cuda_graph_runner"
):
device_map
=
self
.
model
.
gguf_loader
.
tensor_device_map
torch_device
=
get_device
(
'blk.0.self_attn'
,
device_map
)
torch_device
=
"cuda:0"
if
torch_device
==
"cuda"
else
torch_device
self
.
cuda_graph_runner
=
CUDAGraphRunner
()
self
.
cuda_graph_runner
.
capture
(
self
.
model
,
self
.
current_ids
,
self
.
active_cache_position
.
unsqueeze
(
0
),
self
.
active_cache_position
,
self
.
cache
,
return_dict
=
False
,
use_cache
=
True
)
self
.
cuda_graph_runner
.
capture
(
self
.
model
,
self
.
current_ids
,
self
.
active_cache_position
.
unsqueeze
(
0
),
self
.
active_cache_position
,
self
.
cache
,
main_device
=
torch_device
,
return_dict
=
False
,
use_cache
=
True
)
if
hasattr
(
self
,
"cuda_graph_runner"
):
logits
=
self
.
cuda_graph_runner
(
self
.
current_ids
,
self
.
active_cache_position
.
unsqueeze
(
0
),
self
.
active_cache_position
)
...
...
ktransformers/tests/dequant_gpu.py
View file @
77a34c28
import
os
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
=
"1"
#
os.environ["CUDA_VISIBLE_DEVICES"]="1
,2
"
# add path
import
sys
current_path
=
os
.
path
.
abspath
(
os
.
path
.
dirname
(
__file__
))
sys
.
path
.
append
(
current_path
+
"/../.."
)
import
pycuda.autoinit
import
pycuda.driver
as
cuda
from
pycuda.compiler
import
SourceModule
import
numpy
as
np
# from ktransformers.operators.linear import KTransformerLinear,
Quantized
LinearMarlin
# from ktransformers.operators.experts import KTransformers
MLP
Expert,
MLP
ExpertsTorch
# from ktransformers.operators.linear import KTransformer
s
Linear,
K
LinearMarlin
# from ktransformers.operators.experts import KTransformersExpert
s
,
K
ExpertsTorch
from
ktransformers.util.custom_gguf
import
GGUFLoader
import
torch
import
KTransformersOps
...
...
@@ -18,40 +15,44 @@ import time
from
transformers
import
(
AutoConfig
,
)
import
os
# CUDA_LAUNCH_BLOCKING=1
os
.
environ
[
"CUDA_LAUNCH_BLOCKING"
]
=
"1"
gguf_config
=
GGUFLoader
(
"/data/Qwen2-57B-A14B-Instruct-GGUF/q4_k_m"
)
model_name
=
"/data/Qwen2-57B-A14B-Instruct"
key
=
"blk.0."
target
=
"ffn_down_exps.weight"
# Q4k
key
=
"blk.1."
target
=
"attn_q.weight"
t1
=
time
.
time
()
q_weight_cpu
=
gguf_config
.
load_gguf_tensor
(
key
+
target
,
"cpu"
)
# q_weight_cpu = torch.from_numpy(q_weight_cpu)
t2
=
time
.
time
()
q_weight_gpu
=
gguf_config
.
load_gguf_tensor
(
key
+
target
,
"cuda"
)
q_weight_gpu
=
gguf_config
.
load_gguf_tensor
(
key
+
target
,
"cuda
:0
"
)
t3
=
time
.
time
()
print
()
allclose
=
torch
.
allclose
(
q_weight_cpu
,
q_weight_gpu
.
cpu
()
.
to
(
torch
.
float32
)
,
atol
=
1e-6
)
print
(
f
"Q
6
k
{
key
+
target
}
"
)
allclose
=
torch
.
allclose
(
q_weight_cpu
,
q_weight_gpu
.
cpu
(),
atol
=
1e-6
)
print
(
f
"Q
4
k
{
key
+
target
}
"
)
print
(
"load gguf tensor from cpu cost: "
,
t2
-
t1
)
print
(
"load gguf tensor from gpu cost: "
,
t3
-
t2
)
print
(
"allclose: "
,
allclose
)
key
=
"blk.1."
target
=
"ffn_up_shexp.weight"
# Q6k
key
=
"blk.0."
target
=
"ffn_down_exps.weight"
t1
=
time
.
time
()
q_weight_cpu
=
gguf_config
.
load_gguf_tensor
(
key
+
target
,
"cpu"
)
# q_weight_cpu = torch.from_numpy(q_weight_cpu)
t2
=
time
.
time
()
q_weight_gpu
=
gguf_config
.
load_gguf_tensor
(
key
+
target
,
"cuda"
)
q_weight_gpu
=
gguf_config
.
load_gguf_tensor
(
key
+
target
,
"cuda
:0
"
)
t3
=
time
.
time
()
print
()
allclose
=
torch
.
allclose
(
q_weight_cpu
,
q_weight_gpu
.
cpu
(),
atol
=
1e-6
)
print
(
f
"Q
4
k
{
key
+
target
}
"
)
allclose
=
torch
.
allclose
(
q_weight_cpu
,
q_weight_gpu
.
cpu
()
.
to
(
torch
.
float32
)
,
atol
=
1e-6
)
print
(
f
"Q
6
k
{
key
+
target
}
"
)
print
(
"load gguf tensor from cpu cost: "
,
t2
-
t1
)
print
(
"load gguf tensor from gpu cost: "
,
t3
-
t2
)
print
(
"allclose: "
,
allclose
)
ktransformers/tests/dequant_gpu_t.py
View file @
77a34c28
...
...
@@ -7,11 +7,11 @@ import pycuda.autoinit
import
pycuda.driver
as
cuda
from
pycuda.compiler
import
SourceModule
import
numpy
as
np
from
ktransformers.operators.linear
import
KTransformerLinear
,
Quantized
LinearMarlin
from
ktransformers.operators.experts
import
KTransformers
MLP
Expert
,
MLP
ExpertsTorch
from
ktransformers.operators.linear
import
KTransformer
s
Linear
,
K
LinearMarlin
from
ktransformers.operators.experts
import
KTransformersExpert
s
,
K
ExpertsTorch
from
ktransformers.util.custom_gguf
import
GGUFLoader
,
dequantize_q4_k_gpu
,
dequantize_q4_k
import
torch
import
Cuda
Ops
import
KTransformers
Ops
torch
.
set_default_dtype
(
torch
.
bfloat16
)
import
time
from
transformers
import
(
...
...
ktransformers/util/cuda_graph_runner.py
View file @
77a34c28
...
...
@@ -21,6 +21,7 @@ class CUDAGraphRunner:
position_ids
,
cache_position
,
past_key_values
,
main_device
,
**
kwargs
,
)
->
None
:
assert
self
.
graph
is
None
...
...
@@ -29,15 +30,24 @@ class CUDAGraphRunner:
self
.
graph
=
torch
.
cuda
.
CUDAGraph
()
#self.graph.enable_debug_mode()
self
.
model
=
model
inputs_embeds
=
model
.
model
.
embed_tokens
(
cur_token
.
to
(
"cpu"
)).
to
(
"cuda"
)
with
torch
.
cuda
.
graph
(
self
.
graph
):
inputs_embeds
=
model
.
model
.
embed_tokens
(
cur_token
.
to
(
"cpu"
)).
to
(
main_device
)
# torch.cuda.set_device cannot take a bare "cuda"; it must be given a device index
if
main_device
==
"cuda"
:
main_device
=
"cuda:0"
torch
.
cuda
.
set_device
(
main_device
)
self
.
main_device
=
main_device
capture_stream
=
torch
.
cuda
.
Stream
()
with
torch
.
cuda
.
graph
(
self
.
graph
,
stream
=
capture_stream
):
logits
=
model
(
inputs_embeds
=
inputs_embeds
,
position_ids
=
position_ids
,
cache_position
=
cache_position
,
past_key_values
=
past_key_values
,
**
kwargs
)[
0
]
capture_stream
.
wait_stream
(
torch
.
cuda
.
current_stream
())
torch
.
cuda
.
set_device
(
main_device
)
torch
.
cuda
.
set_stream
(
capture_stream
)
past_key_values
.
change_seq_length
(
-
1
)
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
synchronize
(
self
.
main_device
)
#self.graph.debug_dump("cuda_graph_hooked.dot")
# Save the input and output buffers.
...
...
@@ -65,7 +75,7 @@ class CUDAGraphRunner:
#print("begin replay")
#time.sleep(1)
self
.
graph
.
replay
()
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
synchronize
(
self
.
main_device
)
# Return the output tensor.
return
self
.
output_buffers
[
"logits"
]
...
...
ktransformers/util/custom_gguf.py
View file @
77a34c28
...
...
@@ -5,8 +5,8 @@ Description :
Author : Azure-Tang, Boxin Zhang, chenht2022
Date : 2024-07-26 08:48:54
Version : 1.0.0
LastEditors :
Azure
LastEditTime : 2024-0
7-26
0
9
:2
8:2
5
LastEditors :
kkk1nak0
LastEditTime : 2024-0
8-12
0
7
:2
1:5
5
Adapted from https://github.com/99991/pygguf/blob/main/gguf.py
Copyright (c) 2023-2024 The ggml authors
Copyright (c) 2024 Thomas Germer
...
...
@@ -18,6 +18,7 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
import
struct
import
warnings
import
numpy
as
np
import
re
import
numpy.typing
as
npt
from
typing
import
Sequence
import
os
...
...
@@ -168,6 +169,7 @@ class GGUFLoader:
self
.
tensor_file_map
=
{}
self
.
file_data_map
=
{}
self
.
gguf_file_meta
=
{}
self
.
tensor_device_map
=
{}
# Walk through all the .gguf files in the directory
for
root
,
dirs
,
files
in
os
.
walk
(
gguf_path
):
...
...
@@ -292,8 +294,19 @@ class GGUFLoader:
else
:
values
=
GGML_DEQUANTIZE
[
ggml_name
](
data
)
values
=
torch
.
from_numpy
(
values
)
return
values
.
view
(
shape
[::
-
1
])
values
=
values
.
view
(
shape
[::
-
1
])
if
"attn_q"
in
name
and
self
.
gguf_file_meta
[
'general.architecture'
]
in
[
"llama"
]:
n_head
=
self
.
gguf_file_meta
[
'llama.attention.head_count'
]
values
=
(
values
.
reshape
(
n_head
,
values
.
shape
[
0
]
//
n_head
//
2
,
2
,
*
values
.
shape
[
1
:])
.
swapaxes
(
1
,
2
)
.
reshape
(
values
.
shape
))
elif
"attn_k"
in
name
and
self
.
gguf_file_meta
[
'general.architecture'
]
in
[
"llama"
]:
n_head
=
self
.
gguf_file_meta
[
'llama.attention.head_count_kv'
]
values
=
(
values
.
reshape
(
n_head
,
values
.
shape
[
0
]
//
n_head
//
2
,
2
,
*
values
.
shape
[
1
:])
.
swapaxes
(
1
,
2
)
.
reshape
(
values
.
shape
))
return
values
def
read_value
(
f
,
data_type
):
if
data_type
==
DATA_TYPES
[
"string"
]:
...
...
@@ -377,8 +390,14 @@ def dequantize_q2_k(data):
return
d
*
(
scales
&
15
)
*
(
tmp
&
3
)
-
dmin
*
(
scales
>>
4
)
def
dequantize_q2_k_gpu
(
data
):
raise
NotImplementedError
()
def
dequantize_q2_k_gpu
(
data
,
device
:
str
=
"cuda"
):
block_size
=
GGML_BLOCK_SIZES
[
"Q2_K"
]
data
=
np
.
frombuffer
(
data
,
dtype
=
data
.
dtype
)
device
=
torch
.
device
(
device
)
# TODO: this call (and from_numpy in the other dequantize functions) triggers a warning that the numpy array is not writable;
# the best fix is to pass the data pointer to KTransformersOps instead of a Tensor.
data
=
torch
.
from_numpy
(
data
)
return
KTransformersOps
.
dequantize_q2_k
(
data
,
block_size
,
device
)
def
dequantize_q3_k
(
data
):
# C implementation
...
...
@@ -422,8 +441,14 @@ def dequantize_q3_k(data):
(((
qs
[:,
48
:
64
]
>>
6
)
&
3
)
-
bits
[:,
16
:,
7
])
],
axis
=
1
)
def
dequantize_q3_k_gpu
(
data
):
raise
NotImplementedError
()
def
dequantize_q3_k_gpu
(
data
,
device
:
str
=
"cuda"
):
block_size
=
GGML_BLOCK_SIZES
[
"Q3_K"
]
data
=
np
.
frombuffer
(
data
,
dtype
=
data
.
dtype
)
device
=
torch
.
device
(
device
)
# TODO: this call (and from_numpy in the other dequantize functions) triggers a warning that the numpy array is not writable;
# the best fix is to pass the data pointer to KTransformersOps instead of a Tensor.
data
=
torch
.
from_numpy
(
data
)
return
KTransformersOps
.
dequantize_q3_k
(
data
,
block_size
,
device
)
def
dequantize_q4_k
(
data
):
# C implementation
...
...
@@ -511,9 +536,14 @@ def dequantize_q5_k(data):
d8
*
(
qs_hi_4
[:,
3
]
+
(
bits
[:,
:,
7
]
<<
4
))
-
m8
,
],
axis
=
1
)
def
dequantize_q5_k_gpu
(
data
):
raise
NotImplementedError
()
def
dequantize_q5_k_gpu
(
data
,
device
:
str
=
"cuda"
):
block_size
=
GGML_BLOCK_SIZES
[
"Q5_K"
]
data
=
np
.
frombuffer
(
data
,
dtype
=
data
.
dtype
)
device
=
torch
.
device
(
device
)
# TODO: this call (and from_numpy in the other dequantize functions) triggers a warning that the numpy array is not writable;
# the best fix is to pass the data pointer to KTransformersOps instead of a Tensor.
data
=
torch
.
from_numpy
(
data
)
return
KTransformersOps
.
dequantize_q5_k
(
data
,
block_size
,
device
)
def
dequantize_q6_k
(
data
):
# C implementation
...
...
@@ -570,7 +600,7 @@ def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"):
num_blocks
=
len
(
data
)
//
block_size
data
=
np
.
frombuffer
(
data
,
dtype
=
data
.
dtype
)
data
=
torch
.
from_numpy
(
data
)
return
KTransformersOps
.
dequantize_q6_k
(
data
,
210
,
device
)
return
KTransformersOps
.
dequantize_q6_k
(
data
,
block_size
,
device
)
def
dequantize_q4_0
(
data
):
# C implementation
...
...
@@ -679,7 +709,34 @@ GGML_DEQUANTIZE_GPU = {
"Q6_K"
:
dequantize_q6_k_gpu
,
}
def translate_name_to_gguf_mixtral(name):
    """Rewrite Mixtral per-expert weight names into GGUF-style tensor names.

    Every occurrence of
    ``model.layers.<blk>.block_sparse_moe.experts.<expert>.w<N>.weight``
    inside ``name`` is replaced by
    ``blk.<blk>.<ffn_gate|ffn_down|ffn_up>.<expert>.weight`` for
    ``w1``/``w2``/``w3`` respectively. Anything that does not match the
    pattern, or whose ``w<N>`` suffix has no known mapping, is left as is.

    Args:
        name: Parameter name to translate.

    Returns:
        The translated name (identical to ``name`` when nothing matched).
    """
    replacement_template = {
        "w1.weight": "ffn_gate",
        "w2.weight": "ffn_down",
        "w3.weight": "ffn_up",
    }

    # Every literal dot is escaped; an unescaped "." between "model" and
    # "layers" would match any character and translate unrelated names.
    pattern = re.compile(
        r"model\.layers\.(\d+)\.block_sparse_moe\.experts\.(\d+)\.(w\d\.weight)"
    )

    def replace_match(match):
        blk_id = match.group(1)
        expert_id = match.group(2)
        weight_type = match.group(3)
        if weight_type in replacement_template:
            return f"blk.{blk_id}.{replacement_template[weight_type]}.{expert_id}.weight"
        # The pattern accepts any digit after "w"; unknown ones stay untouched.
        return match.group(0)

    return pattern.sub(replace_match, name)
def
translate_name_to_gguf
(
name
):
name
=
translate_name_to_gguf_mixtral
(
name
)
name
=
name
.
replace
(
"lm_head."
,
"output."
)
name
=
name
.
replace
(
"model.embed_tokens."
,
"token_embd."
)
name
=
name
.
replace
(
"model.norm."
,
"output_norm."
)
...
...
@@ -716,9 +773,14 @@ def translate_name_to_gguf(name):
name
=
name
.
replace
(
".mlp.experts.ffn_gate_exps"
,
".ffn_gate_exps"
)
name
=
name
.
replace
(
".mlp.experts.ffn_up_exps"
,
".ffn_up_exps"
)
name
=
name
.
replace
(
".block_sparse_moe.gate."
,
".ffn_gate_inp."
)
name
=
name
.
replace
(
".block_sparse_moe.experts"
,
""
)
return
name
if
__name__
==
'__main__'
:
gguf_path
=
'/mnt/data/model/DeepSeek-Coder-V2-GGUF-WJH'
loader
=
GGUFLoader
(
gguf_path
)
loader
.
load_gguf_tensor
(
'token_embd.weight'
)
ktransformers/util/utils.py
View file @
77a34c28
...
...
@@ -39,6 +39,22 @@ def set_param(module: nn.Module, name: str, weights: torch.Tensor):
param
.
unsqueeze_
(
0
)
setattr
(
module
,
name
,
param
)
def get_device(gguf_module_key: str, device_map: dict):
    """Look up the generate-time device recorded for a GGUF module key.

    Args:
        gguf_module_key: Key to look up in ``device_map``.
        device_map: Mapping from module key to a per-module mapping that may
            carry a ``"generate_device"`` entry.

    Returns:
        The ``"generate_device"`` value stored for the key, or ``"cuda"``
        when the key is absent or its entry has no ``"generate_device"``.
    """
    if gguf_module_key in device_map:
        entry = device_map[gguf_module_key]
        # Entries are not guaranteed to carry "generate_device" (the sibling
        # get_all_used_cuda_device checks for it), so fall back instead of
        # raising KeyError.
        if "generate_device" in entry:
            return entry["generate_device"]
    return "cuda"
def get_all_used_cuda_device(device_map: dict):
    """Collect the distinct non-CPU devices referenced by a device map.

    Gathers every ``"generate_device"`` and ``"prefill_device"`` value found
    in the entries of ``device_map``, drops ``"cpu"``, and returns what is
    left. Entries lacking either field are simply skipped for that field.

    Args:
        device_map: Mapping from module key to a per-module mapping that may
            carry ``"generate_device"`` and/or ``"prefill_device"``.

    Returns:
        A list of the unique devices, ordered by their string form so the
        result is the same from run to run.
    """
    devices = set()
    for key in device_map:
        entry = device_map[key]
        for field in ("generate_device", "prefill_device"):
            if field in entry:
                devices.add(entry[field])
    # discard() is a no-op when "cpu" was never collected.
    devices.discard("cpu")
    return sorted(devices, key=str)
def
load_cur_state_dict
(
module
:
nn
.
Module
,
gguf_loader
:
GGUFLoader
,
prefix
:
str
=
""
):
prefix
=
prefix
.
replace
(
"orig_module."
,
""
)
persistent_buffers
=
{
k
:
v
for
k
,
v
in
module
.
_buffers
.
items
()
if
k
not
in
module
.
_non_persistent_buffers_set
}
...
...
@@ -47,18 +63,19 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
for
name
,
param
in
local_state
.
items
():
key
=
prefix
+
name
translated_key
=
translate_name_to_gguf
(
key
)
print
(
"default loading weights"
,
key
,
translated_key
)
if
translated_key
in
gguf_loader
.
tensor_file_map
:
target_dtype
=
torch
.
get_default_dtype
()
device
=
"cpu"
if
"embd"
in
translated_key
else
"cuda"
device
=
get_device
(
translated_key
[:
translated_key
.
rfind
(
"."
)],
gguf_loader
.
tensor_device_map
)
print
(
f
"loading
{
translated_key
}
to
{
device
}
"
)
# device = "cpu" if "embd" in translated_key else "cuda"
weights
=
gguf_loader
.
load_gguf_tensor
(
translated_key
,
device
=
device
).
to
(
dtype
=
target_dtype
)
set_param
(
module
,
name
,
weights
)
del
weights
else
:
#print(load_config.tensor_file_map.keys())
raise
Exception
(
f
"can't f
a
nd
{
translated_key
}
in GGUF file!"
)
raise
Exception
(
f
"can't f
i
nd
{
translated_key
}
in GGUF file!"
)
def
load_weights
(
module
:
nn
.
Module
,
gguf_loader
:
GGUFLoader
,
prefix
=
''
,
return_when_injected
:
bool
=
False
,
only_load_injected
:
bool
=
False
):
def
load_weights
(
module
:
nn
.
Module
,
gguf_loader
:
GGUFLoader
,
prefix
=
''
):
# print(f"recursively loading weights {prefix},{return_when_injected=}, {only_load_injected=}")
if
not
isinstance
(
module
,
base_operator
.
BaseInjectedModule
):
load_cur_state_dict
(
module
,
gguf_loader
,
prefix
)
...
...
@@ -66,29 +83,36 @@ def load_weights(module:nn.Module, gguf_loader:GGUFLoader, prefix='', return_whe
load_weights
(
child
,
gguf_loader
,
prefix
+
name
+
"."
)
else
:
module
.
load
()
def
prefill_and_generate
(
model
,
tokenizer
,
inputs
,
max_new_tokens
=
10000
):
def
prefill_and_generate
(
model
,
tokenizer
,
inputs
,
max_new_tokens
=
10000
,
use_cuda_graph
:
bool
=
True
):
import
os
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"false"
torch
.
_dynamo
.
config
.
suppress_errors
=
True
batch_size
,
seq_length
=
inputs
.
shape
torch_device
=
inputs
.
device
device_map
=
model
.
gguf_loader
.
tensor_device_map
torch_device
=
get_device
(
'blk.0.self_attn'
,
device_map
)
torch_device
=
"cuda:0"
if
torch_device
==
"cuda"
else
torch_device
inputs
=
inputs
.
to
(
torch_device
)
all_cuda_device
=
get_all_used_cuda_device
(
device_map
)
tokens
=
[]
def
decode_one_tokens
(
cuda_graph_runner
,
cur_token
,
position_ids
,
cache_position
,
past_key_values
):
logits
=
cuda_graph_runner
(
cur_token
,
position_ids
,
cache_position
)
def
decode_one_tokens
(
cuda_graph_runner
,
cur_token
,
position_ids
,
cache_position
,
past_key_values
,
use_cuda_graph
:
bool
=
True
):
if
use_cuda_graph
:
logits
=
cuda_graph_runner
(
cur_token
,
position_ids
,
cache_position
)
else
:
# custom_stream = torch.cuda.Stream()
torch
.
cuda
.
set_device
(
torch_device
)
inputs_embeds
=
model
.
model
.
embed_tokens
(
cur_token
.
to
(
"cpu"
)).
to
(
torch_device
)
# with torch.cuda.stream(custom_stream):
logits
=
model
(
inputs_embeds
=
inputs_embeds
,
position_ids
=
position_ids
,
cache_position
=
cache_position
,
past_key_values
=
past_key_values
,
return_dict
=
False
,
use_cache
=
True
)[
0
]
past_key_values
.
change_seq_length
(
1
)
"""
inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to("cuda")
custom_stream = torch.cuda.Stream()
with torch.cuda.stream(custom_stream):
logits=model(inputs_embeds = inputs_embeds,
position_ids = position_ids,
cache_position = cache_position,
past_key_values = past_key_values,
return_dict = False, use_cache = True) [0]
"""
torch
.
cuda
.
synchronize
()
for
device
in
all_cuda_device
:
torch
.
cuda
.
synchronize
(
device
)
#print(logits)
next_token_scores
=
logits_warper
(
inputs
,
logits
[:,
-
1
,
:])
if
generation_config
.
do_sample
:
...
...
@@ -97,11 +121,12 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
else
:
next_token
=
torch
.
argmax
(
next_token_scores
,
dim
=-
1
)
return
next_token
torch
.
cuda
.
set_device
(
torch_device
)
with
torch
.
no_grad
():
stream
=
TextStreamer
(
tokenizer
)
past_key_values
=
StaticCache
(
config
=
model
.
config
,
max_batch_size
=
1
,
max_cache_len
=
seq_length
+
max_new_tokens
,
device
=
torch_
device
,
dtype
=
model
.
dtype
config
=
model
.
config
,
max_batch_size
=
1
,
max_cache_len
=
seq_length
+
max_new_tokens
,
device
=
device
_map
,
dtype
=
model
.
dtype
)
cache_position
=
torch
.
arange
(
seq_length
,
device
=
torch_device
)
generated_ids
=
torch
.
zeros
(
...
...
@@ -111,21 +136,21 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
past_key_values
.
cur_idx
=
cache_position
start_time
=
time
.
time
()
inputs_embeds
=
model
.
model
.
embed_tokens
(
inputs
.
to
(
"cpu"
)).
to
(
"cuda"
)
inputs_embeds
=
model
.
model
.
embed_tokens
(
inputs
.
to
(
"cpu"
)).
to
(
torch_device
)
logits
=
model
(
inputs_embeds
=
inputs_embeds
,
cache_position
=
cache_position
,
past_key_values
=
past_key_values
,
return_dict
=
False
,
use_cache
=
True
)[
0
][:,
-
1
,:].
unsqueeze
(
0
).
clone
()
)[
0
][:,
-
1
,:].
unsqueeze
(
0
).
clone
()
.
to
(
torch_device
)
generation_config
,
model_kwargs
=
model
.
_prepare_generation_config
(
None
,
max_length
=
max_new_tokens
,
do_sample
=
True
,
top_k
=
5
,
top_p
=
0.85
,
temperature
=
0.1
# change this to modify generate config
)
try
:
# transformers==4.43
logits_warper
=
(
model
.
_get_logits_warper
(
generation_config
,
device
=
inputs
.
device
)
if
generation_config
.
do_sample
else
None
model
.
_get_logits_warper
(
generation_config
,
device
=
inputs
.
device
)
)
except
:
logits_warper
=
(
model
.
_get_logits_warper
(
generation_config
)
if
generation_config
.
do_sample
else
None
model
.
_get_logits_warper
(
generation_config
)
)
next_token_scores
=
logits_warper
(
inputs
,
logits
[:,
-
1
,
:])
if
generation_config
.
do_sample
:
...
...
@@ -137,7 +162,6 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
prefill_count
=
seq_length
prefill_time
=
first_token_time
print
(
stream
.
put
(
next_token
.
item
()),
end
=
""
,
flush
=
True
)
generated_ids
[:,
seq_length
]
=
next_token
tokens
.
append
(
next_token
)
...
...
@@ -145,12 +169,16 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
cache_position
=
torch
.
tensor
([
seq_length
],
device
=
torch_device
)
position_ids
=
cache_position
.
unsqueeze
(
0
)
seq_length
+=
1
cuda_graph_runner
=
CUDAGraphRunner
()
cuda_graph_runner
.
capture
(
model
,
next_token
.
unsqueeze
(
0
),
position_ids
,
cache_position
,
past_key_values
,
return_dict
=
False
,
use_cache
=
True
)
if
use_cuda_graph
:
cuda_graph_runner
=
CUDAGraphRunner
()
cuda_graph_runner
.
capture
(
model
,
next_token
.
unsqueeze
(
0
),
position_ids
,
cache_position
,
past_key_values
,
torch_device
,
return_dict
=
False
,
use_cache
=
True
)
else
:
cuda_graph_runner
=
None
start_time
=
time
.
time
()
for
_
in
range
(
1
,
max_new_tokens
):
next_token
=
decode_one_tokens
(
cuda_graph_runner
,
next_token
.
unsqueeze
(
0
),
position_ids
,
cache_position
,
past_key_values
)
next_token
=
decode_one_tokens
(
cuda_graph_runner
,
next_token
.
unsqueeze
(
0
),
position_ids
,
cache_position
,
past_key_values
,
use_cuda_graph
).
to
(
torch_device
)
inputs
=
torch
.
cat
((
inputs
,
next_token
.
unsqueeze
(
0
)),
dim
=-
1
)
generated_ids
[:,
cache_position
]
=
next_token
.
int
()
tokens
.
append
(
next_token
.
int
())
...
...
@@ -163,6 +191,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
print
(
stream
.
put
(
next_token
.
item
()),
end
=
""
,
flush
=
True
)
cache_position
+=
1
position_ids
=
cache_position
.
unsqueeze
(
0
)
total_time
=
time
.
time
()
-
start_time
tokens_generated
=
len
(
tokens
)
...
...
setup.py
View file @
77a34c28
...
...
@@ -6,7 +6,7 @@ Author : chenxl
Date : 2024-07-27 16:15:27
Version : 1.0.0
LastEditors : chenxl
LastEditTime : 2024-08-
08 02:45
:1
5
LastEditTime : 2024-08-
14 16:36
:1
9
Adapted from:
https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
Copyright (c) 2023, Tri Dao.
...
...
@@ -299,6 +299,15 @@ setup(
'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu'
,
'ktransformers/ktransformers_ext/cuda/binding.cpp'
,
'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu'
])
],
extra_compile_args
=
{
'cxx'
:
[
'-O3'
],
'nvcc'
:
[
'-O3'
,
'--use_fast_math'
,
'-Xcompiler'
,
'-fPIC'
,
]
}
)
]
)
third_party/llamafile/sgemm.cpp
View file @
77a34c28
...
...
@@ -94,7 +94,6 @@ static const struct GemmFuncs {
#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX2__)
#if defined(__AVX512F__)
printf
(
"__AVX512F__
\n
"
);
#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
// AMD Zen4+ (2023-)
sgemm
=
llamafile_sgemm_amd_zen4
;
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment