OpenDAS / ktransformers · Commit 77a34c28

Unverified commit, authored Aug 15, 2024 by UnicornChan, committed by GitHub on Aug 15, 2024.

Merge pull request #36 from kvcache-ai/develop-0.1.2

Release v0.1.2

Parents: 44f57270, 395cd3e7

Showing 9 changed files with 215 additions and 81 deletions (+215, -81).
Changed files:

ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml  (+29, -9)
ktransformers/server/backend/interfaces/ktransformers.py  (+5, -1)
ktransformers/tests/dequant_gpu.py  (+19, -18)
ktransformers/tests/dequant_gpu_t.py  (+3, -3)
ktransformers/util/cuda_graph_runner.py  (+14, -4)
ktransformers/util/custom_gguf.py  (+74, -12)
ktransformers/util/utils.py  (+60, -31)
setup.py  (+11, -2)
third_party/llamafile/sgemm.cpp  (+0, -1)
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml

@@ -2,36 +2,56 @@
     class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
   replace:
     class: ktransformers.operators.RoPE.RotaryEmbedding
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
 - match:
     name: "^model\\.layers\\..*$"  # regular expression
     class: torch.nn.Linear  # only match modules matching name and class simultaneously
   replace:
-    class: ktransformers.operators.linear.KTransformerLinear  # optimized Kernel on quantized data types
+    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
-      generate_op: "QuantizedLinearMarlin"
-      prefill_op: "QuantizedLinearTorch"
+      generate_op: "KLinearMarlin"
+      prefill_op: "KLinearTorch"
 - match:
     name: "^model\\.layers\\..*\\.mlp$"
     class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
   replace:
-    class: ktransformers.operators.experts.Qwen2MoeSparseMoeBlockInjected  # mlp module with custom forward function
+    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlock  # mlp module with custom forward function
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
 - match:
     name: "^model\\.layers\\..*\\.mlp\\.experts$"
   replace:
-    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE Kernel with expert paralleism
-    device: "cpu"  # which devices to load this module when initializing
+    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE Kernel with expert paralleism
+    # device: "cpu"  # which devices to load this module when initializing
     kwargs:
       prefill_device: "cuda"
-      prefill_mlp_type: "MLPExpertsTorch"
+      prefill_op: "KExpertsTorch"
       generate_device: "cpu"
-      generate_mlp_type: "MLPCPUExperts"
+      generate_op: "KExpertsCPU"
       out_device: "cuda"
   recursive: False  # don't recursively inject submodules of this module
 - match:
     name: "^model$"
   replace:
-    class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelPerLayerPrefill"
+    class: "ktransformers.operators.models.KQwen2MoeModel"
     kwargs:
       per_layer_prefill_intput_threshold: 0  # 0 is close layer wise prefill
 - match:
     name: "^model.embed_tokens"
   replace:
     class: "default"
     kwargs:
       generate_device: "cpu"
       prefill_device: "cpu"
 - match:
     name: "^model\\.layers\\..*\\."
   replace:
     class: "default"
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"
\ No newline at end of file
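Each rule above pairs a regex over the dotted module path (match.name), and optionally a class, with a replacement operator and per-phase devices. The sketch below is a small, self-contained illustration of how that kind of matching could walk a torch module tree; the rule table and model are made up and this is not the project's optimize_and_load_gguf logic.

import re
import torch.nn as nn

# Illustration only: a module is selected when its dotted name matches the regex
# AND, if given, its class matches. Rule and model below are hypothetical.
rules = [
    {"match": {"name": r"^model\.layers\..*$", "class": nn.Linear},
     "replace": {"kwargs": {"generate_device": "cuda", "prefill_device": "cuda"}}},
]

model = nn.ModuleDict({
    "layers": nn.ModuleList([nn.ModuleDict({"q_proj": nn.Linear(8, 8), "act": nn.ReLU()})])
})

for name, module in model.named_modules():
    full_name = "model." + name if name else "model"
    for rule in rules:
        name_ok = re.match(rule["match"]["name"], full_name) is not None
        class_ok = isinstance(module, rule["match"].get("class", object))
        if name_ok and class_ok:
            print("would inject:", full_name, "->", rule["replace"]["kwargs"])
# Only model.layers.0.q_proj is picked: the ReLU fails the class check and
# modules outside model.layers fail the name check.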
ktransformers/server/backend/interfaces/ktransformers.py

@@ -6,6 +6,7 @@ from ktransformers.optimize.optimize import optimize_and_load_gguf
 from ktransformers.models.custom_cache import StaticCache
 from ktransformers.util.cuda_graph_runner import CUDAGraphRunner
 from ktransformers.local_chat import custom_models, default_optimize_rules
+from ktransformers.util.utils import get_device


 class KTransformersThreadContext(TransformersThreadContext):

@@ -48,8 +49,11 @@ class KTransformersInterface(TransformersInterface):
     def decode_one_tokens(self):
         if not hasattr(self, "cuda_graph_runner"):
+            device_map = self.model.gguf_loader.tensor_device_map
+            torch_device = get_device('blk.0.self_attn', device_map)
+            torch_device = "cuda:0" if torch_device == "cuda" else torch_device
             self.cuda_graph_runner = CUDAGraphRunner()
-            self.cuda_graph_runner.capture(self.model, self.current_ids, self.active_cache_position.unsqueeze(0), self.active_cache_position, self.cache, return_dict=False, use_cache=True)
+            self.cuda_graph_runner.capture(self.model, self.current_ids, self.active_cache_position.unsqueeze(0), self.active_cache_position, self.cache, main_device=torch_device, return_dict=False, use_cache=True)

         if hasattr(self, "cuda_graph_runner"):
             logits = self.cuda_graph_runner(self.current_ids, self.active_cache_position.unsqueeze(0), self.active_cache_position)
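The new lines resolve the capture device from gguf_loader.tensor_device_map and pin a bare "cuda" to "cuda:0"; the comment added in cuda_graph_runner.py below gives the reason, torch.cuda.set_device needs an explicit index. A tiny sketch of that normalization, with an illustrative map entry rather than a real loader:

import torch

# Illustrative entry; real maps come from gguf_loader.tensor_device_map.
device_map = {"blk.0.self_attn": {"generate_device": "cuda"}}

torch_device = device_map.get("blk.0.self_attn", {}).get("generate_device", "cuda")
torch_device = "cuda:0" if torch_device == "cuda" else torch_device  # set_device wants an index
if torch.cuda.is_available():
    torch.cuda.set_device(torch_device)
    print(torch.cuda.current_device())  # -> 0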
ktransformers/tests/dequant_gpu.py

 import os
-os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+# os.environ["CUDA_VISIBLE_DEVICES"]="1,2"
 # add path
 import sys
 current_path = os.path.abspath(os.path.dirname(__file__))
 sys.path.append(current_path + "/../..")
 import pycuda.autoinit
 import pycuda.driver as cuda
 from pycuda.compiler import SourceModule
 import numpy as np
-# from ktransformers.operators.linear import KTransformerLinear, QuantizedLinearMarlin
-# from ktransformers.operators.experts import KTransformersMLPExpert, MLPExpertsTorch
+# from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
+# from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
 from ktransformers.util.custom_gguf import GGUFLoader
 import torch
 import KTransformersOps

@@ -18,40 +15,44 @@ import time
 from transformers import (
     AutoConfig,
 )
 import os
 # CUDA_LAUNCH_BLOCKING=1
 os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

 gguf_config = GGUFLoader("/data/Qwen2-57B-A14B-Instruct-GGUF/q4_k_m")
 model_name = "/data/Qwen2-57B-A14B-Instruct"

-key = "blk.0."
-target = "ffn_down_exps.weight"
+# Q4k
+key = "blk.1."
+target = "attn_q.weight"
 t1 = time.time()
 q_weight_cpu = gguf_config.load_gguf_tensor(key + target, "cpu")
 # q_weight_cpu = torch.from_numpy(q_weight_cpu)
 t2 = time.time()
-q_weight_gpu = gguf_config.load_gguf_tensor(key + target, "cuda")
+q_weight_gpu = gguf_config.load_gguf_tensor(key + target, "cuda:0")
 t3 = time.time()
 print()
-allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu().to(torch.float32), atol=1e-6)
-print(f"Q6k {key + target}")
+allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu(), atol=1e-6)
+print(f"Q4k {key + target}")
 print("load gguf tensor from cpu cost: ", t2 - t1)
 print("load gguf tensor from gpu cost: ", t3 - t2)
 print("allclose: ", allclose)

-key = "blk.1."
-target = "ffn_up_shexp.weight"
+# Q6k
+key = "blk.0."
+target = "ffn_down_exps.weight"
 t1 = time.time()
 q_weight_cpu = gguf_config.load_gguf_tensor(key + target, "cpu")
 # q_weight_cpu = torch.from_numpy(q_weight_cpu)
 t2 = time.time()
-q_weight_gpu = gguf_config.load_gguf_tensor(key + target, "cuda")
+q_weight_gpu = gguf_config.load_gguf_tensor(key + target, "cuda:0")
 t3 = time.time()
 print()
-allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu(), atol=1e-6)
-print(f"Q4k {key + target}")
+allclose = torch.allclose(q_weight_cpu, q_weight_gpu.cpu().to(torch.float32), atol=1e-6)
+print(f"Q6k {key + target}")
 print("load gguf tensor from cpu cost: ", t2 - t1)
 print("load gguf tensor from gpu cost: ", t3 - t2)
 print("allclose: ", allclose)
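The test now loads each tensor on "cuda:0", times the CPU and GPU dequantization paths, and checks that they agree. The same pattern could be wrapped in a small helper like the sketch below; the GGUF directory and tensor name are placeholders to substitute with a local dump.

import time
import torch
from ktransformers.util.custom_gguf import GGUFLoader

def compare_dequant(gguf_dir: str, tensor_name: str, atol: float = 1e-6) -> bool:
    """Time CPU vs GPU dequantization of one GGUF tensor and check agreement.
    gguf_dir and tensor_name are placeholders, not paths from this commit."""
    loader = GGUFLoader(gguf_dir)
    t0 = time.time()
    cpu_t = loader.load_gguf_tensor(tensor_name, "cpu")
    t1 = time.time()
    gpu_t = loader.load_gguf_tensor(tensor_name, "cuda:0")
    t2 = time.time()
    ok = torch.allclose(cpu_t, gpu_t.cpu().to(cpu_t.dtype), atol=atol)
    print(f"{tensor_name}: cpu {t1 - t0:.3f}s, gpu {t2 - t1:.3f}s, allclose={ok}")
    return ok

# e.g. compare_dequant("/data/Qwen2-57B-A14B-Instruct-GGUF/q4_k_m", "blk.1.attn_q.weight")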
ktransformers/tests/dequant_gpu_t.py

@@ -7,11 +7,11 @@ import pycuda.autoinit
 import pycuda.driver as cuda
 from pycuda.compiler import SourceModule
 import numpy as np
-from ktransformers.operators.linear import KTransformerLinear, QuantizedLinearMarlin
-from ktransformers.operators.experts import KTransformersMLPExpert, MLPExpertsTorch
+from ktransformers.operators.linear import KTransformersLinear, KLinearMarlin
+from ktransformers.operators.experts import KTransformersExperts, KExpertsTorch
 from ktransformers.util.custom_gguf import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
 import torch
-import CudaOps
+import KTransformersOps
 torch.set_default_dtype(torch.bfloat16)
 import time
 from transformers import (
ktransformers/util/cuda_graph_runner.py

@@ -21,6 +21,7 @@ class CUDAGraphRunner:
         position_ids,
         cache_position,
         past_key_values,
+        main_device,
         **kwargs,
     ) -> None:
         assert self.graph is None

@@ -29,15 +30,24 @@ class CUDAGraphRunner:
         self.graph = torch.cuda.CUDAGraph()
         #self.graph.enable_debug_mode()
         self.model = model
-        inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to("cuda")
-        with torch.cuda.graph(self.graph):
+        inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to(main_device)
+        # torch.cuda.set_device can't set "cuda", must have a index
+        if main_device == "cuda":
+            main_device = "cuda:0"
+        torch.cuda.set_device(main_device)
+        self.main_device = main_device
+        capture_stream = torch.cuda.Stream()
+        with torch.cuda.graph(self.graph, stream=capture_stream):
             logits = model(inputs_embeds=inputs_embeds,
                            position_ids=position_ids,
                            cache_position=cache_position,
                            past_key_values=past_key_values,
                            **kwargs)[0]
+        capture_stream.wait_stream(torch.cuda.current_stream())
+        torch.cuda.set_device(main_device)
+        torch.cuda.set_stream(capture_stream)
         past_key_values.change_seq_length(-1)
-        torch.cuda.synchronize()
+        torch.cuda.synchronize(self.main_device)
         #self.graph.debug_dump("cuda_graph_hooked.dot")
         # Save the input and output buffers.

@@ -65,7 +75,7 @@ class CUDAGraphRunner:
         #print("begin replay")
         #time.sleep(1)
         self.graph.replay()
-        torch.cuda.synchronize()
+        torch.cuda.synchronize(self.main_device)
         # Return the output tensor.
         return self.output_buffers["logits"]
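capture() now receives a main_device, pins it with torch.cuda.set_device, captures on an explicit stream, and synchronizes that specific device. For reference, here is a minimal capture-once / replay-many sketch of the underlying PyTorch API, using a toy model rather than the project's CUDAGraphRunner; it assumes a CUDA device is available.

import torch

torch.cuda.set_device("cuda:0")              # set_device needs an explicit index
model = torch.nn.Linear(16, 16).cuda()
static_input = torch.randn(8, 16, device="cuda:0")

# Warm-up on a side stream, as the PyTorch docs recommend before capture.
warmup = torch.cuda.Stream()
warmup.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(warmup):
    model(static_input)
torch.cuda.current_stream().wait_stream(warmup)

graph = torch.cuda.CUDAGraph()
capture_stream = torch.cuda.Stream()
with torch.cuda.graph(graph, stream=capture_stream):
    static_output = model(static_input)      # recorded into the graph, not run eagerly

# Replay: refill the captured input buffer in place, then replay the graph.
static_input.copy_(torch.randn(8, 16, device="cuda:0"))
graph.replay()
torch.cuda.synchronize("cuda:0")
print(static_output.shape)                   # torch.Size([8, 16])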
ktransformers/util/custom_gguf.py

@@ -5,8 +5,8 @@ Description :
 Author : Azure-Tang, Boxin Zhang, chenht2022
 Date : 2024-07-26 08:48:54
 Version : 1.0.0
-LastEditors : Azure
-LastEditTime : 2024-07-26 09:28:25
+LastEditors : kkk1nak0
+LastEditTime : 2024-08-12 07:21:55
 Adapted from https://github.com/99991/pygguf/blob/main/gguf.py
 Copyright (c) 2023-2024 The ggml authors
 Copyright (c) 2024 Thomas Germer

@@ -18,6 +18,7 @@ Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 import struct
 import warnings
 import numpy as np
+import re
 import numpy.typing as npt
 from typing import Sequence
 import os

@@ -168,6 +169,7 @@ class GGUFLoader:
         self.tensor_file_map = {}
         self.file_data_map = {}
         self.gguf_file_meta = {}
+        self.tensor_device_map = {}

         # Walk through all the .gguf files in the directory
         for root, dirs, files in os.walk(gguf_path):

@@ -292,8 +294,19 @@ class GGUFLoader:
         else:
             values = GGML_DEQUANTIZE[ggml_name](data)
             values = torch.from_numpy(values)
-        return values.view(shape[::-1])
+        values = values.view(shape[::-1])
+        if "attn_q" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
+            n_head = self.gguf_file_meta['llama.attention.head_count']
+            values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
+                      .swapaxes(1, 2)
+                      .reshape(values.shape))
+        elif "attn_k" in name and self.gguf_file_meta['general.architecture'] in ["llama"]:
+            n_head = self.gguf_file_meta['llama.attention.head_count_kv']
+            values = (values.reshape(n_head, values.shape[0] // n_head // 2, 2, *values.shape[1:])
+                      .swapaxes(1, 2)
+                      .reshape(values.shape))
+        return values

 def read_value(f, data_type):
     if data_type == DATA_TYPES["string"]:

@@ -377,8 +390,14 @@ def dequantize_q2_k(data):
     return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4)

-def dequantize_q2_k_gpu(data):
-    raise NotImplementedError()
+def dequantize_q2_k_gpu(data, device: str = "cuda"):
+    block_size = GGML_BLOCK_SIZES["Q2_K"]
+    data = np.frombuffer(data, dtype=data.dtype)
+    device = torch.device(device)
+    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
+    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
+    data = torch.from_numpy(data)
+    return KTransformersOps.dequantize_q2_k(data, block_size, device)

 def dequantize_q3_k(data):
     # C implementation

@@ -422,8 +441,14 @@ def dequantize_q3_k(data):
         (((qs[:, 48:64] >> 6) & 3) - bits[:, 16:, 7])
     ], axis=1)

-def dequantize_q3_k_gpu(data):
-    raise NotImplementedError()
+def dequantize_q3_k_gpu(data, device: str = "cuda"):
+    block_size = GGML_BLOCK_SIZES["Q3_K"]
+    data = np.frombuffer(data, dtype=data.dtype)
+    device = torch.device(device)
+    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
+    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
+    data = torch.from_numpy(data)
+    return KTransformersOps.dequantize_q3_k(data, block_size, device)

 def dequantize_q4_k(data):
     # C implementation

@@ -511,9 +536,14 @@ def dequantize_q5_k(data):
         d8 * (qs_hi_4[:, 3] + (bits[:, :, 7] << 4)) - m8,
     ], axis=1)

-def dequantize_q5_k_gpu(data):
-    raise NotImplementedError()
+def dequantize_q5_k_gpu(data, device: str = "cuda"):
+    block_size = GGML_BLOCK_SIZES["Q5_K"]
+    data = np.frombuffer(data, dtype=data.dtype)
+    device = torch.device(device)
+    # TODO: this and from_numpy in other functions will cause a warning saying that numpy is not writable,
+    # the best way to fix this is transfer ptr to KTransformersOps instead of Tensor.
+    data = torch.from_numpy(data)
+    return KTransformersOps.dequantize_q5_k(data, block_size, device)

 def dequantize_q6_k(data):
     # C implementation

@@ -570,7 +600,7 @@ def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"):
     num_blocks = len(data) // block_size
     data = np.frombuffer(data, dtype=data.dtype)
     data = torch.from_numpy(data)
-    return KTransformersOps.dequantize_q6_k(data, 210, device)
+    return KTransformersOps.dequantize_q6_k(data, block_size, device)

 def dequantize_q4_0(data):
     # C implementation

@@ -679,7 +709,34 @@ GGML_DEQUANTIZE_GPU = {
     "Q6_K": dequantize_q6_k_gpu,
 }

+def translate_name_to_gguf_mixtral(name):
+
+    replacement_template = {
+        "w1.weight": "ffn_gate",
+        "w2.weight": "ffn_down",
+        "w3.weight": "ffn_up"
+    }
+
+    pattern = re.compile(r"model.layers\.(\d+)\.block_sparse_moe\.experts\.(\d+)\.(w\d\.weight)")
+
+    def replace_match(match):
+        blk_id = match.group(1)
+        expert_id = match.group(2)
+        weight_type = match.group(3)
+        if weight_type in replacement_template:
+            return f"blk.{blk_id}.{replacement_template[weight_type]}.{expert_id}.weight"
+        else:
+            return match.group(0)
+
+    new_name = re.sub(pattern, replace_match, name)
+
+    return new_name
+
 def translate_name_to_gguf(name):
+
+    name = translate_name_to_gguf_mixtral(name)
+
     name = name.replace("lm_head.", "output.")
     name = name.replace("model.embed_tokens.", "token_embd.")
     name = name.replace("model.norm.", "output_norm.")

@@ -716,9 +773,14 @@ def translate_name_to_gguf(name):
     name = name.replace(".mlp.experts.ffn_gate_exps", ".ffn_gate_exps")
     name = name.replace(".mlp.experts.ffn_up_exps", ".ffn_up_exps")
+
+    name = name.replace(".block_sparse_moe.gate.", ".ffn_gate_inp.")
+    name = name.replace(".block_sparse_moe.experts", "")

     return name

 if __name__ == '__main__':
     gguf_path = '/mnt/data/model/DeepSeek-Coder-V2-GGUF-WJH'
     loader = GGUFLoader(gguf_path)
     loader.load_gguf_tensor('token_embd.weight')
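The new translate_name_to_gguf_mixtral maps Mixtral expert weight names onto the GGUF naming scheme with one regex substitution. A short usage illustration, re-implemented standalone here rather than imported from the package:

import re

pattern = re.compile(r"model.layers\.(\d+)\.block_sparse_moe\.experts\.(\d+)\.(w\d)\.weight")
mapping = {"w1": "ffn_gate", "w2": "ffn_down", "w3": "ffn_up"}

def to_gguf(name: str) -> str:
    # layer index, expert index, and w1/w2/w3 are captured and rearranged.
    return pattern.sub(lambda m: f"blk.{m.group(1)}.{mapping[m.group(3)]}.{m.group(2)}.weight", name)

print(to_gguf("model.layers.3.block_sparse_moe.experts.5.w2.weight"))
# -> blk.3.ffn_down.5.weight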
ktransformers/util/utils.py

@@ -39,6 +39,22 @@ def set_param(module: nn.Module, name: str, weights: torch.Tensor):
         param.unsqueeze_(0)
     setattr(module, name, param)

+def get_device(gguf_module_key: str, device_map: dict):
+    if gguf_module_key in device_map:
+        return device_map[gguf_module_key]["generate_device"]
+    else:
+        return "cuda"
+
+def get_all_used_cuda_device(device_map: dict):
+    all_device_list = set()
+    for key in device_map:
+        all_device_list.add(device_map[key]["generate_device"]) if "generate_device" in device_map[key] else None
+        all_device_list.add(device_map[key]["prefill_device"]) if "prefill_device" in device_map[key] else None
+    if "cpu" in all_device_list:
+        all_device_list.remove("cpu")
+    all_device_list = list(all_device_list)
+    return all_device_list
+
 def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str = ""):
     prefix = prefix.replace("orig_module.", "")
     persistent_buffers = {k: v for k, v in module._buffers.items() if k not in module._non_persistent_buffers_set}

@@ -47,18 +63,19 @@ def load_cur_state_dict(module: nn.Module, gguf_loader: GGUFLoader, prefix: str
     for name, param in local_state.items():
         key = prefix + name
         translated_key = translate_name_to_gguf(key)
-        print("default loading weights", key, translated_key)
         if translated_key in gguf_loader.tensor_file_map:
             target_dtype = torch.get_default_dtype()
-            device = "cpu" if "embd" in translated_key else "cuda"
+            device = get_device(translated_key[:translated_key.rfind(".")], gguf_loader.tensor_device_map)
+            print(f"loading {translated_key} to {device}")
+            # device = "cpu" if "embd" in translated_key else "cuda"
             weights = gguf_loader.load_gguf_tensor(translated_key, device=device).to(dtype=target_dtype)
             set_param(module, name, weights)
             del weights
         else:
             #print(load_config.tensor_file_map.keys())
-            raise Exception(f"can't fand {translated_key} in GGUF file!")
+            raise Exception(f"can't find {translated_key} in GGUF file!")

-def load_weights(module: nn.Module, gguf_loader: GGUFLoader, prefix='', return_when_injected: bool = False, only_load_injected: bool = False):
+def load_weights(module: nn.Module, gguf_loader: GGUFLoader, prefix=''):
     # print(f"recursively loading weights {prefix},{return_when_injected=}, {only_load_injected=}")
     if not isinstance(module, base_operator.BaseInjectedModule):
         load_cur_state_dict(module, gguf_loader, prefix)

@@ -66,29 +83,36 @@ def load_weights(module: nn.Module, gguf_loader: GGUFLoader, prefix='', return_when_injected: bool = False, only_load_injected: bool = False):
             load_weights(child, gguf_loader, prefix + name + ".")
     else:
         module.load()

-def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
+def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000, use_cuda_graph: bool = True):
     import os
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
     torch._dynamo.config.suppress_errors = True
     batch_size, seq_length = inputs.shape
-    torch_device = inputs.device
+    device_map = model.gguf_loader.tensor_device_map
+    torch_device = get_device('blk.0.self_attn', device_map)
+    torch_device = "cuda:0" if torch_device == "cuda" else torch_device
+    inputs = inputs.to(torch_device)
+    all_cuda_device = get_all_used_cuda_device(device_map)
     tokens = []

-    def decode_one_tokens(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values):
-        logits = cuda_graph_runner(cur_token, position_ids, cache_position)
+    def decode_one_tokens(cuda_graph_runner, cur_token, position_ids, cache_position, past_key_values, use_cuda_graph: bool = True):
+        if use_cuda_graph:
+            logits = cuda_graph_runner(cur_token, position_ids, cache_position)
+        else:
+            # custom_stream = torch.cuda.Stream()
+            torch.cuda.set_device(torch_device)
+            inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to(torch_device)
+            # with torch.cuda.stream(custom_stream):
+            logits = model(inputs_embeds=inputs_embeds,
+                           position_ids=position_ids,
+                           cache_position=cache_position,
+                           past_key_values=past_key_values,
+                           return_dict=False, use_cache=True)[0]
         past_key_values.change_seq_length(1)
         """
         inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to("cuda")
         custom_stream = torch.cuda.Stream()
         with torch.cuda.stream(custom_stream):
             logits=model(inputs_embeds = inputs_embeds,
                 position_ids = position_ids,
                 cache_position = cache_position,
                 past_key_values = past_key_values,
                 return_dict = False, use_cache = True) [0]
         """
-        torch.cuda.synchronize()
+        for device in all_cuda_device:
+            torch.cuda.synchronize(device)
         #print(logits)
         next_token_scores = logits_warper(inputs, logits[:, -1, :])
         if generation_config.do_sample:

@@ -97,11 +121,12 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
         else:
             next_token = torch.argmax(next_token_scores, dim=-1)
         return next_token

+    torch.cuda.set_device(torch_device)
     with torch.no_grad():
         stream = TextStreamer(tokenizer)
         past_key_values = StaticCache(
-            config=model.config, max_batch_size=1, max_cache_len=seq_length + max_new_tokens, device=torch_device, dtype=model.dtype
+            config=model.config, max_batch_size=1, max_cache_len=seq_length + max_new_tokens, device=device_map, dtype=model.dtype
         )
         cache_position = torch.arange(seq_length, device=torch_device)
         generated_ids = torch.zeros(

@@ -111,21 +136,21 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
         past_key_values.cur_idx = cache_position
         start_time = time.time()

-        inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to("cuda")
+        inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to(torch_device)
         logits = model(
             inputs_embeds=inputs_embeds, cache_position=cache_position, past_key_values=past_key_values, return_dict=False, use_cache=True
-        )[0][:, -1, :].unsqueeze(0).clone()
+        )[0][:, -1, :].unsqueeze(0).clone().to(torch_device)
         generation_config, model_kwargs = model._prepare_generation_config(
             None, max_length=max_new_tokens,
             do_sample=True, top_k=5, top_p=0.85, temperature=0.1  # change this to modify generate config
         )
         try:  # transformers==4.43
             logits_warper = (
-                model._get_logits_warper(generation_config, device=inputs.device) if generation_config.do_sample else None
+                model._get_logits_warper(generation_config, device=inputs.device)
             )
         except:
             logits_warper = (
-                model._get_logits_warper(generation_config) if generation_config.do_sample else None
+                model._get_logits_warper(generation_config)
             )
         next_token_scores = logits_warper(inputs, logits[:, -1, :])
         if generation_config.do_sample:

@@ -137,7 +162,6 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
         prefill_count = seq_length
         prefill_time = first_token_time
         print(stream.put(next_token.item()), end="", flush=True)
         generated_ids[:, seq_length] = next_token
         tokens.append(next_token)

@@ -145,12 +169,16 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
         cache_position = torch.tensor([seq_length], device=torch_device)
         position_ids = cache_position.unsqueeze(0)
         seq_length += 1

-        cuda_graph_runner = CUDAGraphRunner()
-        cuda_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, return_dict=False, use_cache=True)
+        if use_cuda_graph:
+            cuda_graph_runner = CUDAGraphRunner()
+            cuda_graph_runner.capture(model, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, torch_device, return_dict=False, use_cache=True)
+        else:
+            cuda_graph_runner = None

         start_time = time.time()
         for _ in range(1, max_new_tokens):
-            next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values)
+            next_token = decode_one_tokens(cuda_graph_runner, next_token.unsqueeze(0), position_ids, cache_position, past_key_values, use_cuda_graph).to(torch_device)
             inputs = torch.cat((inputs, next_token.unsqueeze(0)), dim=-1)
             generated_ids[:, cache_position] = next_token.int()
             tokens.append(next_token.int())

@@ -163,6 +191,7 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
             print(stream.put(next_token.item()), end="", flush=True)
             cache_position += 1
             position_ids = cache_position.unsqueeze(0)
         total_time = time.time() - start_time
         tokens_generated = len(tokens)
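prefill_and_generate now takes its target device from gguf_loader.tensor_device_map and synchronizes every CUDA device that map references, instead of a single bare torch.cuda.synchronize(). A hedged sketch of that helper pair over a made-up device map, re-implemented here for illustration rather than imported from ktransformers:

import torch

# Made-up map in the shape tensor_device_map uses: module key -> per-phase device.
device_map = {
    "blk.0.self_attn":   {"generate_device": "cuda:0", "prefill_device": "cuda:0"},
    "blk.0.mlp.experts": {"generate_device": "cpu",    "prefill_device": "cuda:0"},
}

def get_device(gguf_module_key: str, device_map: dict) -> str:
    # Same fallback as the added helper: unknown keys default to "cuda".
    return device_map.get(gguf_module_key, {}).get("generate_device", "cuda")

def get_all_used_cuda_device(device_map: dict) -> list:
    devices = set()
    for entry in device_map.values():
        devices.add(entry.get("generate_device"))
        devices.add(entry.get("prefill_device"))
    devices.discard("cpu")
    devices.discard(None)          # only CUDA devices need explicit synchronization
    return list(devices)

print(get_device("blk.0.self_attn", device_map))   # -> cuda:0
print(get_all_used_cuda_device(device_map))        # -> ['cuda:0']

# After each decode step the loop waits on every device in that list:
if torch.cuda.is_available():
    for device in get_all_used_cuda_device(device_map):
        torch.cuda.synchronize(device)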
setup.py

@@ -6,7 +6,7 @@ Author : chenxl
 Date : 2024-07-27 16:15:27
 Version : 1.0.0
 LastEditors : chenxl
-LastEditTime : 2024-08-08 02:45:15
+LastEditTime : 2024-08-14 16:36:19
 Adapted from:
 https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
 Copyright (c) 2023, Tri Dao.

@@ -299,6 +299,15 @@ setup(
             'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu',
             'ktransformers/ktransformers_ext/cuda/binding.cpp',
             'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu'
-        ])
+        ],
+        extra_compile_args={
+            'cxx': ['-O3'],
+            'nvcc': [
+                '-O3',
+                '--use_fast_math',
+                '-Xcompiler', '-fPIC',
+            ]
+        }
+        )
     ]
 )
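The second hunk adds explicit extra_compile_args for the CUDA extension build. For orientation, this is roughly the shape such a CUDAExtension definition takes with torch's build helpers; the module name and the surrounding setup() call below are illustrative, only the source list and compiler flags come from the hunk above.

from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name="KTransformersOps",   # illustrative; the project's real setup.py is larger
    ext_modules=[
        CUDAExtension(
            "KTransformersOps",
            sources=[
                "ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu",
                "ktransformers/ktransformers_ext/cuda/binding.cpp",
                "ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu",
            ],
            extra_compile_args={
                "cxx": ["-O3"],
                "nvcc": ["-O3", "--use_fast_math", "-Xcompiler", "-fPIC"],
            },
        )
    ],
    cmdclass={"build_ext": BuildExtension},
)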
third_party/llamafile/sgemm.cpp

@@ -94,7 +94,6 @@ static const struct GemmFuncs {
 #if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
 #if defined(__AVX2__)
 #if defined(__AVX512F__)
-    printf("__AVX512F__\n");
 #if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
     // AMD Zen4+ (2023-)
     sgemm = llamafile_sgemm_amd_zen4;