Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
ktransformers
Commits
f2938031
Unverified
Commit
f2938031
authored
Aug 09, 2024
by
UnicornChan
Committed by
GitHub
Aug 09, 2024
Browse files
Merge pull request #27 from chenht2022/develop-0.1.2
[Feature] towards 0.1.2
parents
442e13bc
c1cc7d2c
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
745 additions
and
727 deletions
+745
-727
ktransformers/ktransformers_ext/CMakeLists.txt
ktransformers/ktransformers_ext/CMakeLists.txt
+1
-2
ktransformers/ktransformers_ext/bench/bench_linear.py
ktransformers/ktransformers_ext/bench/bench_linear.py
+31
-21
ktransformers/ktransformers_ext/bench/bench_linear_torch.py
ktransformers/ktransformers_ext/bench/bench_linear_torch.py
+21
-23
ktransformers/ktransformers_ext/bench/bench_mlp.py
ktransformers/ktransformers_ext/bench/bench_mlp.py
+31
-21
ktransformers/ktransformers_ext/bench/bench_mlp_torch.py
ktransformers/ktransformers_ext/bench/bench_mlp_torch.py
+33
-53
ktransformers/ktransformers_ext/bench/bench_moe.py
ktransformers/ktransformers_ext/bench/bench_moe.py
+35
-29
ktransformers/ktransformers_ext/bench/bench_moe_torch.py
ktransformers/ktransformers_ext/bench/bench_moe_torch.py
+68
-79
ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h
ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h
+30
-6
ktransformers/ktransformers_ext/examples/test_linear.py
ktransformers/ktransformers_ext/examples/test_linear.py
+22
-43
ktransformers/ktransformers_ext/examples/test_mlp.py
ktransformers/ktransformers_ext/examples/test_mlp.py
+35
-51
ktransformers/ktransformers_ext/examples/test_moe.py
ktransformers/ktransformers_ext/examples/test_moe.py
+73
-65
ktransformers/ktransformers_ext/ext_bindings.cpp
ktransformers/ktransformers_ext/ext_bindings.cpp
+118
-203
ktransformers/ktransformers_ext/operators/llamafile/linear.cpp
...sformers/ktransformers_ext/operators/llamafile/linear.cpp
+38
-12
ktransformers/ktransformers_ext/operators/llamafile/linear.h
ktransformers/ktransformers_ext/operators/llamafile/linear.h
+11
-7
ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp
ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp
+55
-35
ktransformers/ktransformers_ext/operators/llamafile/mlp.h
ktransformers/ktransformers_ext/operators/llamafile/mlp.h
+16
-12
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp
+32
-63
ktransformers/ktransformers_ext/operators/llamafile/moe.h
ktransformers/ktransformers_ext/operators/llamafile/moe.h
+3
-2
ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.cpp
...ransformers_ext/operators/llamafile/shared_mem_buffer.cpp
+55
-0
ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.h
...ktransformers_ext/operators/llamafile/shared_mem_buffer.h
+37
-0
No files found.
ktransformers/ktransformers_ext/CMakeLists.txt
View file @
f2938031
...
...
@@ -22,14 +22,13 @@ option(LLAMA_AVX2 "llama: enable AVX2"
option
(
LLAMA_AVX512
"llama: enable AVX512"
OFF
)
option
(
LLAMA_AVX512_VBMI
"llama: enable AVX512-VBMI"
OFF
)
option
(
LLAMA_AVX512_VNNI
"llama: enable AVX512-VNNI"
OFF
)
option
(
LLAMA_AVX512_BF16
"llama: enable AVX512-BF16"
OFF
)
option
(
LLAMA_FMA
"llama: enable FMA"
OFF
)
# in MSVC F16C is implied with AVX2/AVX512
if
(
NOT MSVC
)
option
(
LLAMA_F16C
"llama: enable F16C"
OFF
)
endif
()
option
(
LLAMA_AVX512_FANCY_SIMD
"llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI"
OFF
)
option
(
LLAMA_AVX512_BF16
"llama: enable AVX512-BF16"
OFF
)
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
...
...
ktransformers/ktransformers_ext/bench/bench_linear.py
View file @
f2938031
...
...
@@ -6,7 +6,7 @@ Author : chenht2022
Date : 2024-07-25 10:31:59
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-0
7-25
10:3
2:51
LastEditTime : 2024-0
8-06
10:3
5:35
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import
os
,
sys
...
...
@@ -15,15 +15,18 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
import
cpuinfer_ext
import
torch
input_size
=
16384
output_size
=
5120
stride
=
16
group_max_len
=
1024
layer_num
=
10
qlen
=
1
CPUInfer
=
cpuinfer_ext
.
CPUInfer
(
64
)
warm_up_iter
=
1000
test_iter
=
10000
def
bench_linear
(
quant_mode
:
str
):
with
torch
.
inference_mode
(
mode
=
True
):
input_size
=
16384
output_size
=
5120
stride
=
16
layer_num
=
10
CPUInfer
=
cpuinfer_ext
.
CPUInfer
(
64
)
warm_up_iter
=
1000
test_iter
=
10000
hidden_type
=
30
# ggml_type::GGML_TYPE_BF16
if
quant_mode
==
"fp32"
:
...
...
@@ -66,30 +69,37 @@ def bench_linear(quant_mode: str):
projs
=
[]
for
_
in
range
(
layer_num
):
proj
=
torch
.
randn
((
output_size
,
input_size
),
dtype
=
torch
.
float32
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
config
=
cpuinfer_ext
.
linear
.
LinearConfig
(
input_size
,
output_size
,
stride
,
proj
.
data_ptr
(),
proj_type
,
hidden_type
)
config
=
cpuinfer_ext
.
linear
.
LinearConfig
(
input_size
,
output_size
,
stride
,
group_max_len
,
proj
.
data_ptr
(),
proj_type
,
hidden_type
)
linear
=
cpuinfer_ext
.
linear
.
Linear
(
config
)
projs
.
append
(
proj
)
linears
.
append
(
linear
)
input
=
torch
.
randn
((
layer_num
,
qlen
,
input_size
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
output
=
torch
.
empty
((
layer_num
,
qlen
,
output_size
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
# warm up
for
i
in
range
(
warm_up_iter
):
linear
=
linears
[
i
%
layer_num
]
input
=
torch
.
randn
((
1
,
input_size
),
dtype
=
torch
.
bfloat16
).
contiguous
()
output
=
torch
.
empty
((
1
,
output_size
),
dtype
=
torch
.
bfloat16
).
contiguous
()
CPUInfer
.
submit
(
linear
.
forward
,
input
.
data_ptr
(),
output
.
data_ptr
())
CPUInfer
.
submit
(
linears
[
i
%
layer_num
].
forward
(
qlen
,
input
[
i
%
layer_num
].
data_ptr
(),
output
[
i
%
layer_num
].
data_ptr
()
)
)
CPUInfer
.
sync
()
# test
total_time
=
0
start
=
time
.
perf_counter
()
for
i
in
range
(
test_iter
):
linear
=
linears
[
i
%
layer_num
]
input
=
torch
.
randn
((
1
,
input_size
),
dtype
=
torch
.
bfloat16
).
contiguous
()
output
=
torch
.
empty
((
1
,
output_size
),
dtype
=
torch
.
bfloat16
).
contiguous
()
start
=
time
.
perf_counter
()
CPUInfer
.
submit
(
linear
.
forward
,
input
.
data_ptr
(),
output
.
data_ptr
())
CPUInfer
.
submit
(
linears
[
i
%
layer_num
].
forward
(
qlen
,
input
[
i
%
layer_num
].
data_ptr
(),
output
[
i
%
layer_num
].
data_ptr
()
)
)
CPUInfer
.
sync
()
end
=
time
.
perf_counter
()
total_time
+
=
end
-
start
end
=
time
.
perf_counter
()
total_time
=
end
-
start
print
(
'Quant mode: '
,
quant_mode
)
print
(
'Time(s): '
,
total_time
)
print
(
'Iteration: '
,
test_iter
)
...
...
ktransformers/ktransformers_ext/bench/bench_linear_torch.py
View file @
f2938031
...
...
@@ -14,14 +14,17 @@ import time
import
torch
import
torch.nn.quantized
as
nnq
scale
,
zero_point
=
0.1
,
0
# Adjust scale and zero_point based on your dataset
input_size
=
16384
output_size
=
5120
layer_num
=
10
qlen
=
1
warm_up_iter
=
1000
test_iter
=
10000
def
bench_linear
(
quant_mode
:
str
):
with
torch
.
inference_mode
(
mode
=
True
):
input_size
=
16384
output_size
=
5120
layer_num
=
10
warm_up_iter
=
1000
test_iter
=
10000
if
quant_mode
==
"fp32"
:
proj_type
=
torch
.
float32
bytes_per_elem
=
4.000000
...
...
@@ -41,37 +44,32 @@ def bench_linear(quant_mode: str):
for
_
in
range
(
layer_num
):
proj
=
torch
.
randn
((
output_size
,
input_size
),
dtype
=
torch
.
float32
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
if
quant_mode
==
"qint8"
:
scale
,
zero_point
=
0.1
,
0
# Adjust scale and zero_point based on your dataset
proj_q
=
torch
.
quantize_per_tensor
(
proj
,
scale
,
zero_point
,
torch
.
qint8
)
quantized_layer
=
nnq
.
Linear
(
input_size
,
output_size
)
quantized_layer
.
set_weight_bias
(
proj_q
,
None
)
projs
.
append
(
quantized_layer
)
else
:
projs
.
append
(
proj
.
to
(
proj_type
))
input
=
torch
.
randn
((
layer_num
,
qlen
,
input_size
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
# warm up
for
i
in
range
(
warm_up_iter
):
input
=
torch
.
randn
((
1
,
input_size
),
dtype
=
torch
.
float32
).
contiguous
()
if
quant_mode
==
"qint8"
:
input_q
=
torch
.
quantize_per_tensor
(
input
,
scale
,
zero_point
,
torch
.
quint8
)
quantized_layer
=
projs
[
i
%
layer_num
]
t_output
=
quantized_layer
(
input_q
)
if
isinstance
(
projs
[
i
%
layer_num
],
nnq
.
Linear
):
input_q
=
torch
.
quantize_per_tensor
(
input
[
i
%
layer_num
].
to
(
torch
.
float32
),
scale
,
zero_point
,
torch
.
quint8
)
t_output
=
projs
[
i
%
layer_num
](
input_q
)
else
:
t_output
=
torch
.
mm
(
input
.
to
(
proj_type
),
projs
[
i
%
layer_num
].
t
())
t_output
=
torch
.
mm
(
input
[
i
%
layer_num
]
.
to
(
proj_type
),
projs
[
i
%
layer_num
].
t
())
# test
total_time
=
0
start
=
time
.
perf_counter
()
for
i
in
range
(
test_iter
):
input
=
torch
.
randn
((
1
,
input_size
),
dtype
=
torch
.
float32
).
contiguous
()
start
=
time
.
perf_counter
()
if
quant_mode
==
"qint8"
:
input_q
=
torch
.
quantize_per_tensor
(
input
,
scale
,
zero_point
,
torch
.
quint8
)
quantized_layer
=
projs
[
i
%
layer_num
]
t_output
=
quantized_layer
(
input_q
)
if
isinstance
(
projs
[
i
%
layer_num
],
nnq
.
Linear
):
input_q
=
torch
.
quantize_per_tensor
(
input
[
i
%
layer_num
].
to
(
torch
.
float32
),
scale
,
zero_point
,
torch
.
quint8
)
t_output
=
projs
[
i
%
layer_num
](
input_q
)
else
:
t_output
=
torch
.
mm
(
input
.
to
(
proj_type
),
projs
[
i
%
layer_num
].
t
())
end
=
time
.
perf_counter
()
total_time
+
=
end
-
start
t_output
=
torch
.
mm
(
input
[
i
%
layer_num
]
.
to
(
proj_type
),
projs
[
i
%
layer_num
].
t
())
end
=
time
.
perf_counter
()
total_time
=
end
-
start
print
(
'Quant mode: '
,
quant_mode
)
print
(
'Time(s): '
,
total_time
)
print
(
'Iteration: '
,
test_iter
)
...
...
ktransformers/ktransformers_ext/bench/bench_mlp.py
View file @
f2938031
...
...
@@ -6,7 +6,7 @@ Author : chenht2022
Date : 2024-07-16 10:43:18
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-0
7-25
10:3
2:55
LastEditTime : 2024-0
8-06
10:3
6:04
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import
os
,
sys
...
...
@@ -15,15 +15,18 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
import
cpuinfer_ext
import
torch
hidden_size
=
5120
intermediate_size
=
3072
stride
=
16
group_max_len
=
1024
layer_num
=
10
qlen
=
1
CPUInfer
=
cpuinfer_ext
.
CPUInfer
(
64
)
warm_up_iter
=
1000
test_iter
=
10000
def
bench_mlp
(
quant_mode
:
str
):
with
torch
.
inference_mode
(
mode
=
True
):
hidden_size
=
5120
intermediate_size
=
3072
stride
=
16
layer_num
=
10
CPUInfer
=
cpuinfer_ext
.
CPUInfer
(
64
)
warm_up_iter
=
1000
test_iter
=
10000
hidden_type
=
30
# ggml_type::GGML_TYPE_BF16
if
quant_mode
==
"fp32"
:
...
...
@@ -93,32 +96,39 @@ def bench_mlp(quant_mode: str):
gate_proj
=
torch
.
randn
((
intermediate_size
,
hidden_size
),
dtype
=
torch
.
float32
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
up_proj
=
torch
.
randn
((
intermediate_size
,
hidden_size
),
dtype
=
torch
.
float32
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
down_proj
=
torch
.
randn
((
hidden_size
,
intermediate_size
),
dtype
=
torch
.
float32
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
config
=
cpuinfer_ext
.
mlp
.
MLPConfig
(
hidden_size
,
intermediate_size
,
stride
,
gate_proj
.
data_ptr
(),
up_proj
.
data_ptr
(),
down_proj
.
data_ptr
(),
gate_type
,
up_type
,
down_type
,
hidden_type
)
config
=
cpuinfer_ext
.
mlp
.
MLPConfig
(
hidden_size
,
intermediate_size
,
stride
,
group_max_len
,
gate_proj
.
data_ptr
(),
up_proj
.
data_ptr
(),
down_proj
.
data_ptr
(),
gate_type
,
up_type
,
down_type
,
hidden_type
)
mlp
=
cpuinfer_ext
.
mlp
.
MLP
(
config
)
gate_projs
.
append
(
gate_proj
)
up_projs
.
append
(
up_proj
)
down_projs
.
append
(
down_proj
)
mlps
.
append
(
mlp
)
input
=
torch
.
randn
((
layer_num
,
qlen
,
hidden_size
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
output
=
torch
.
empty
((
layer_num
,
qlen
,
hidden_size
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
# warm up
for
i
in
range
(
warm_up_iter
):
mlp
=
mlps
[
i
%
layer_num
]
input
=
torch
.
randn
((
1
,
hidden_size
),
dtype
=
torch
.
bfloat16
).
contiguous
()
output
=
torch
.
empty
((
1
,
hidden_size
),
dtype
=
torch
.
bfloat16
).
contiguous
()
CPUInfer
.
submit
(
mlp
.
forward
,
input
.
data_ptr
(),
output
.
data_ptr
())
CPUInfer
.
submit
(
mlps
[
i
%
layer_num
].
forward
(
qlen
,
input
[
i
%
layer_num
].
data_ptr
(),
output
[
i
%
layer_num
].
data_ptr
()
)
)
CPUInfer
.
sync
()
# test
total_time
=
0
start
=
time
.
perf_counter
()
for
i
in
range
(
test_iter
):
mlp
=
mlps
[
i
%
layer_num
]
input
=
torch
.
randn
((
1
,
hidden_size
),
dtype
=
torch
.
bfloat16
).
contiguous
()
output
=
torch
.
empty
((
1
,
hidden_size
),
dtype
=
torch
.
bfloat16
).
contiguous
()
start
=
time
.
perf_counter
()
CPUInfer
.
submit
(
mlp
.
forward
,
input
.
data_ptr
(),
output
.
data_ptr
())
CPUInfer
.
submit
(
mlps
[
i
%
layer_num
].
forward
(
qlen
,
input
[
i
%
layer_num
].
data_ptr
(),
output
[
i
%
layer_num
].
data_ptr
()
)
)
CPUInfer
.
sync
()
end
=
time
.
perf_counter
()
total_time
+
=
end
-
start
end
=
time
.
perf_counter
()
total_time
=
end
-
start
print
(
'Quant mode: '
,
quant_mode
)
print
(
'Time(s): '
,
total_time
)
print
(
'Iteration: '
,
test_iter
)
...
...
ktransformers/ktransformers_ext/bench/bench_mlp_torch.py
View file @
f2938031
...
...
@@ -14,17 +14,38 @@ import time
import
torch
import
torch.nn.quantized
as
nnq
scale
,
zero_point
=
0.1
,
0
# Adjust scale and zero_point based on your dataset
hidden_size
=
5120
intermediate_size
=
3072
layer_num
=
10
qlen
=
1
warm_up_iter
=
1000
test_iter
=
10000
def
act_fn
(
x
):
return
x
/
(
1.0
+
torch
.
exp
(
-
x
))
def
mlp_torch
(
input
,
gate_proj
,
up_proj
,
down_proj
):
if
isinstance
(
gate_proj
,
nnq
.
Linear
):
input_q
=
torch
.
quantize_per_tensor
(
input
.
to
(
torch
.
float32
),
scale
,
zero_point
,
torch
.
quint8
)
gate_buf
=
gate_proj
(
input_q
)
up_buf
=
up_proj
(
input_q
)
gate_buf
=
gate_buf
.
dequantize
()
up_buf
=
up_buf
.
dequantize
()
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
intermediate_q
=
torch
.
quantize_per_tensor
(
intermediate
,
scale
,
zero_point
,
torch
.
quint8
)
expert_output
=
down_proj
(
intermediate_q
)
ret
=
expert_output
.
dequantize
()
else
:
gate_buf
=
torch
.
mm
(
input
.
to
(
gate_proj
.
dtype
),
gate_proj
.
t
())
up_buf
=
torch
.
mm
(
input
.
to
(
up_proj
.
dtype
),
up_proj
.
t
())
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
ret
=
torch
.
mm
(
intermediate
.
to
(
down_proj
.
dtype
),
down_proj
.
t
())
return
ret
def
bench_mlp
(
quant_mode
:
str
):
with
torch
.
inference_mode
(
mode
=
True
):
hidden_size
=
5120
intermediate_size
=
3072
layer_num
=
10
warm_up_iter
=
1000
test_iter
=
10000
if
quant_mode
==
"fp32"
:
proj_type
=
torch
.
float32
bytes_per_elem
=
4.000000
...
...
@@ -48,7 +69,6 @@ def bench_mlp(quant_mode: str):
up_proj
=
torch
.
randn
((
intermediate_size
,
hidden_size
),
dtype
=
torch
.
float32
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
down_proj
=
torch
.
randn
((
hidden_size
,
intermediate_size
),
dtype
=
torch
.
float32
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
if
quant_mode
==
"qint8"
:
scale
,
zero_point
=
0.1
,
0
# Adjust scale and zero_point based on your dataset
gate_proj_q
=
torch
.
quantize_per_tensor
(
gate_proj
,
scale
,
zero_point
,
torch
.
qint8
)
quantized_gate
=
nnq
.
Linear
(
hidden_size
,
intermediate_size
)
quantized_gate
.
set_weight_bias
(
gate_proj_q
,
None
)
...
...
@@ -65,58 +85,18 @@ def bench_mlp(quant_mode: str):
gate_projs
.
append
(
gate_proj
.
to
(
proj_type
))
up_projs
.
append
(
up_proj
.
to
(
proj_type
))
down_projs
.
append
(
down_proj
.
to
(
proj_type
))
input
=
torch
.
randn
((
layer_num
,
qlen
,
hidden_size
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
# warm up
for
i
in
range
(
warm_up_iter
):
input
=
torch
.
randn
((
1
,
hidden_size
),
dtype
=
torch
.
float32
).
contiguous
()
if
quant_mode
==
"qint8"
:
input_q
=
torch
.
quantize_per_tensor
(
input
,
scale
,
zero_point
,
torch
.
quint8
)
quantized_gate
=
gate_projs
[
i
%
layer_num
]
gate_buf
=
quantized_gate
(
input_q
)
quantized_up
=
up_projs
[
i
%
layer_num
]
up_buf
=
quantized_gate
(
input_q
)
gate_buf
=
gate_buf
.
dequantize
()
up_buf
=
up_buf
.
dequantize
()
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
intermediate_q
=
torch
.
quantize_per_tensor
(
intermediate
,
scale
,
zero_point
,
torch
.
quint8
)
quantized_down
=
down_projs
[
i
%
layer_num
]
t_output
=
quantized_down
(
intermediate_q
)
else
:
gate_proj
=
gate_projs
[
i
%
layer_num
]
up_proj
=
up_projs
[
i
%
layer_num
]
down_proj
=
down_projs
[
i
%
layer_num
]
gate_buf
=
torch
.
mm
(
input
.
to
(
proj_type
),
gate_proj
.
t
())
up_buf
=
torch
.
mm
(
input
.
to
(
proj_type
),
up_proj
.
t
())
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
t_output
=
torch
.
mm
(
intermediate
.
to
(
proj_type
),
down_proj
.
t
())
mlp_torch
(
input
[
i
%
layer_num
],
gate_projs
[
i
%
layer_num
],
up_projs
[
i
%
layer_num
],
down_projs
[
i
%
layer_num
])
# test
total_time
=
0
start
=
time
.
perf_counter
()
for
i
in
range
(
test_iter
):
input
=
torch
.
randn
((
1
,
hidden_size
),
dtype
=
torch
.
float32
).
contiguous
()
start
=
time
.
perf_counter
()
if
quant_mode
==
"qint8"
:
input_q
=
torch
.
quantize_per_tensor
(
input
,
scale
,
zero_point
,
torch
.
quint8
)
quantized_gate
=
gate_projs
[
i
%
layer_num
]
gate_buf
=
quantized_gate
(
input_q
)
quantized_up
=
up_projs
[
i
%
layer_num
]
up_buf
=
quantized_gate
(
input_q
)
gate_buf
=
gate_buf
.
dequantize
()
up_buf
=
up_buf
.
dequantize
()
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
intermediate_q
=
torch
.
quantize_per_tensor
(
intermediate
,
scale
,
zero_point
,
torch
.
quint8
)
quantized_down
=
down_projs
[
i
%
layer_num
]
t_output
=
quantized_down
(
intermediate_q
)
else
:
gate_proj
=
gate_projs
[
i
%
layer_num
]
up_proj
=
up_projs
[
i
%
layer_num
]
down_proj
=
down_projs
[
i
%
layer_num
]
gate_buf
=
torch
.
mm
(
input
.
to
(
proj_type
),
gate_proj
.
t
())
up_buf
=
torch
.
mm
(
input
.
to
(
proj_type
),
up_proj
.
t
())
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
t_output
=
torch
.
mm
(
intermediate
.
to
(
proj_type
),
down_proj
.
t
())
end
=
time
.
perf_counter
()
total_time
+=
end
-
start
mlp_torch
(
input
[
i
%
layer_num
],
gate_projs
[
i
%
layer_num
],
up_projs
[
i
%
layer_num
],
down_projs
[
i
%
layer_num
])
end
=
time
.
perf_counter
()
total_time
=
end
-
start
print
(
'Quant mode: '
,
quant_mode
)
print
(
'Time(s): '
,
total_time
)
print
(
'Iteration: '
,
test_iter
)
...
...
ktransformers/ktransformers_ext/bench/bench_moe.py
View file @
f2938031
...
...
@@ -6,7 +6,7 @@ Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-0
7-25 10:33:00
LastEditTime : 2024-0
8-06 10:41:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import
os
,
sys
...
...
@@ -15,21 +15,21 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
import
cpuinfer_ext
import
torch
expert_num
=
160
hidden_size
=
5120
intermediate_size
=
1536
stride
=
16
group_min_len
=
10
group_max_len
=
1024
n_routed_experts
=
6
layer_num
=
10
qlen
=
1
CPUInfer
=
cpuinfer_ext
.
CPUInfer
(
64
)
warm_up_iter
=
1000
test_iter
=
10000
def
bench_moe
(
quant_mode
:
str
):
with
torch
.
inference_mode
(
mode
=
True
):
expert_num
=
10
hidden_size
=
5120
intermediate_size
=
1536
stride
=
16
group_min_len
=
10
group_max_len
=
1024
n_routed_experts
=
6
layer_num
=
10
qlen
=
1
CPUInfer
=
cpuinfer_ext
.
CPUInfer
(
64
)
warm_up_iter
=
1000
test_iter
=
10000
hidden_type
=
30
# ggml_type::GGML_TYPE_BF16
if
quant_mode
==
"fp32"
:
gate_type
=
0
# ggml_type::GGML_TYPE_F32
...
...
@@ -104,32 +104,38 @@ def bench_moe(quant_mode: str):
up_projs
.
append
(
up_proj
)
down_projs
.
append
(
down_proj
)
moes
.
append
(
moe
)
expert_ids
=
torch
.
randint
(
0
,
expert_num
,
(
layer_num
,
qlen
,
n_routed_experts
),
dtype
=
torch
.
int64
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
expert_ids
=
torch
.
stack
([
torch
.
stack
([
torch
.
randperm
(
expert_num
,
dtype
=
torch
.
int64
,
device
=
"cuda"
)[:
n_routed_experts
]
for
_
in
range
(
qlen
)])
for
_
in
range
(
layer_num
)]
).
to
(
"cpu"
).
contiguous
()
weights
=
torch
.
rand
((
layer_num
,
qlen
,
n_routed_experts
),
dtype
=
torch
.
float32
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
input
=
torch
.
randn
((
layer_num
,
qlen
,
hidden_size
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
output
=
torch
.
empty
((
layer_num
,
qlen
,
hidden_size
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
# warm up
for
i
in
range
(
warm_up_iter
):
CPUInfer
.
submit
(
moes
[
i
%
layer_num
].
forward
,
qlen
,
n_routed_experts
,
expert_ids
[
i
%
layer_num
].
data_ptr
(),
weights
[
i
%
layer_num
].
data_ptr
(),
input
[
i
%
layer_num
].
data_ptr
(),
output
[
i
%
layer_num
].
data_ptr
())
CPUInfer
.
submit
(
moes
[
i
%
layer_num
].
forward
(
qlen
,
n_routed_experts
,
expert_ids
[
i
%
layer_num
].
data_ptr
(),
weights
[
i
%
layer_num
].
data_ptr
(),
input
[
i
%
layer_num
].
data_ptr
(),
output
[
i
%
layer_num
].
data_ptr
()
)
)
CPUInfer
.
sync
()
# test
start
=
time
.
perf_counter
()
for
i
in
range
(
test_iter
):
CPUInfer
.
submit
(
moes
[
i
%
layer_num
].
forward
,
qlen
,
n_routed_experts
,
expert_ids
[
i
%
layer_num
].
data_ptr
(),
weights
[
i
%
layer_num
].
data_ptr
(),
input
[
i
%
layer_num
].
data_ptr
(),
output
[
i
%
layer_num
].
data_ptr
())
CPUInfer
.
submit
(
moes
[
i
%
layer_num
].
forward
(
qlen
,
n_routed_experts
,
expert_ids
[
i
%
layer_num
].
data_ptr
(),
weights
[
i
%
layer_num
].
data_ptr
(),
input
[
i
%
layer_num
].
data_ptr
(),
output
[
i
%
layer_num
].
data_ptr
()
)
)
CPUInfer
.
sync
()
end
=
time
.
perf_counter
()
total_time
=
end
-
start
...
...
ktransformers/ktransformers_ext/bench/bench_moe_torch.py
View file @
f2938031
...
...
@@ -14,19 +14,71 @@ import time
import
torch
import
torch.nn.quantized
as
nnq
scale
,
zero_point
=
0.1
,
0
# Adjust scale and zero_point based on your dataset
expert_num
=
160
hidden_size
=
5120
intermediate_size
=
1536
n_routed_experts
=
6
layer_num
=
10
qlen
=
1
warm_up_iter
=
1000
test_iter
=
10000
def
act_fn
(
x
):
return
x
/
(
1.0
+
torch
.
exp
(
-
x
))
def
mlp_torch
(
input
,
gate_proj
,
up_proj
,
down_proj
):
if
isinstance
(
gate_proj
,
nnq
.
Linear
):
input_q
=
torch
.
quantize_per_tensor
(
input
.
to
(
torch
.
float32
),
scale
,
zero_point
,
torch
.
quint8
)
gate_buf
=
gate_proj
(
input_q
)
up_buf
=
up_proj
(
input_q
)
gate_buf
=
gate_buf
.
dequantize
()
up_buf
=
up_buf
.
dequantize
()
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
intermediate_q
=
torch
.
quantize_per_tensor
(
intermediate
,
scale
,
zero_point
,
torch
.
quint8
)
expert_output
=
down_proj
(
intermediate_q
)
ret
=
expert_output
.
dequantize
()
else
:
gate_buf
=
torch
.
mm
(
input
.
to
(
gate_proj
.
dtype
),
gate_proj
.
t
())
up_buf
=
torch
.
mm
(
input
.
to
(
up_proj
.
dtype
),
up_proj
.
t
())
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
ret
=
torch
.
mm
(
intermediate
.
to
(
down_proj
.
dtype
),
down_proj
.
t
())
return
ret
def
moe_torch
(
input
,
expert_ids
,
weights
,
gate_proj
,
up_proj
,
down_proj
):
cnts
=
expert_ids
.
new_zeros
((
expert_ids
.
shape
[
0
],
expert_num
))
cnts
.
scatter_
(
1
,
expert_ids
,
1
)
tokens_per_expert
=
cnts
.
sum
(
dim
=
0
)
idxs
=
expert_ids
.
view
(
-
1
).
argsort
()
sorted_tokens
=
input
[
idxs
//
expert_ids
.
shape
[
1
]]
outputs
=
[]
start_idx
=
0
for
i
,
num_tokens
in
enumerate
(
tokens_per_expert
):
end_idx
=
start_idx
+
num_tokens
if
num_tokens
==
0
:
continue
tokens_for_this_expert
=
sorted_tokens
[
start_idx
:
end_idx
]
expert_out
=
mlp_torch
(
tokens_for_this_expert
,
gate_proj
[
i
],
up_proj
[
i
],
down_proj
[
i
])
outputs
.
append
(
expert_out
)
start_idx
=
end_idx
outs
=
torch
.
cat
(
outputs
,
dim
=
0
)
if
len
(
outputs
)
else
sorted_tokens
.
new_empty
(
0
)
new_x
=
torch
.
empty_like
(
outs
)
new_x
[
idxs
]
=
outs
t_output
=
(
new_x
.
view
(
*
expert_ids
.
shape
,
-
1
)
.
type
(
weights
.
dtype
)
.
mul_
(
weights
.
unsqueeze
(
dim
=-
1
))
.
sum
(
dim
=
1
)
.
type
(
new_x
.
dtype
)
)
return
t_output
def
bench_moe
(
quant_mode
:
str
):
with
torch
.
inference_mode
(
mode
=
True
):
expert_num
=
10
hidden_size
=
5120
intermediate_size
=
1536
n_routed_experts
=
6
layer_num
=
10
warm_up_iter
=
1000
test_iter
=
10000
if
quant_mode
==
"fp32"
:
proj_type
=
torch
.
float32
bytes_per_elem
=
4.000000
...
...
@@ -50,7 +102,6 @@ def bench_moe(quant_mode: str):
up_proj
=
torch
.
randn
((
expert_num
,
intermediate_size
,
hidden_size
),
dtype
=
torch
.
float32
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
down_proj
=
torch
.
randn
((
expert_num
,
hidden_size
,
intermediate_size
),
dtype
=
torch
.
float32
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
if
quant_mode
==
"qint8"
:
scale
,
zero_point
=
0.1
,
0
# Adjust scale and zero_point based on your dataset
quantized_gate_proj
=
[]
quantized_up_proj
=
[]
quantized_down_proj
=
[]
...
...
@@ -74,82 +125,20 @@ def bench_moe(quant_mode: str):
gate_projs
.
append
(
gate_proj
.
to
(
proj_type
))
up_projs
.
append
(
up_proj
.
to
(
proj_type
))
down_projs
.
append
(
down_proj
.
to
(
proj_type
))
expert_ids
=
torch
.
stack
([
torch
.
stack
([
torch
.
randperm
(
expert_num
,
dtype
=
torch
.
int64
,
device
=
"cuda"
)[:
n_routed_experts
]
for
_
in
range
(
qlen
)])
for
_
in
range
(
layer_num
)]).
to
(
"cpu"
).
contiguous
()
weights
=
torch
.
rand
((
layer_num
,
qlen
,
n_routed_experts
),
dtype
=
torch
.
float32
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
input
=
torch
.
randn
((
layer_num
,
qlen
,
hidden_size
),
dtype
=
torch
.
bfloat16
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
# warm up
for
i
in
range
(
warm_up_iter
):
expert_ids
=
torch
.
randint
(
0
,
expert_num
,
(
n_routed_experts
,),
dtype
=
torch
.
int64
).
contiguous
()
weights
=
torch
.
rand
((
n_routed_experts
,),
dtype
=
torch
.
float32
).
contiguous
()
input
=
torch
.
randn
((
1
,
hidden_size
),
dtype
=
torch
.
float32
).
contiguous
()
if
quant_mode
==
"qint8"
:
input_q
=
torch
.
quantize_per_tensor
(
input
,
scale
,
zero_point
,
torch
.
quint8
)
t_output
=
torch
.
zeros
((
1
,
hidden_size
),
dtype
=
torch
.
float32
).
contiguous
()
gate_proj
=
gate_projs
[
i
%
layer_num
]
up_proj
=
up_projs
[
i
%
layer_num
]
down_proj
=
down_projs
[
i
%
layer_num
]
for
i
,
expert_id
in
enumerate
(
expert_ids
):
quantized_gate
=
gate_proj
[
expert_id
]
gate_buf
=
quantized_gate
(
input_q
)
quantized_up
=
up_proj
[
expert_id
]
up_buf
=
quantized_up
(
input_q
)
gate_buf
=
gate_buf
.
dequantize
()
up_buf
=
up_buf
.
dequantize
()
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
intermediate_q
=
torch
.
quantize_per_tensor
(
intermediate
,
scale
,
zero_point
,
torch
.
quint8
)
quantized_down
=
down_proj
[
expert_id
]
expert_output
=
quantized_down
(
intermediate_q
)
expert_output
=
expert_output
.
dequantize
()
t_output
+=
weights
[
i
]
*
expert_output
else
:
t_output
=
torch
.
zeros
((
1
,
hidden_size
),
dtype
=
proj_type
).
contiguous
()
gate_proj
=
gate_projs
[
i
%
layer_num
]
up_proj
=
up_projs
[
i
%
layer_num
]
down_proj
=
down_projs
[
i
%
layer_num
]
for
i
,
expert_id
in
enumerate
(
expert_ids
):
gate_buf
=
torch
.
mm
(
input
.
to
(
proj_type
),
gate_proj
[
expert_id
].
t
())
up_buf
=
torch
.
mm
(
input
.
to
(
proj_type
),
up_proj
[
expert_id
].
t
())
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
expert_output
=
torch
.
mm
(
intermediate
.
to
(
proj_type
),
down_proj
[
expert_id
].
t
())
t_output
+=
weights
[
i
]
*
expert_output
moe_torch
(
input
[
i
%
layer_num
],
expert_ids
[
i
%
layer_num
],
weights
[
i
%
layer_num
],
gate_projs
[
i
%
layer_num
],
up_projs
[
i
%
layer_num
],
down_projs
[
i
%
layer_num
])
# test
total_time
=
0
start
=
time
.
perf_counter
()
for
i
in
range
(
test_iter
):
expert_ids
=
torch
.
randint
(
0
,
expert_num
,
(
n_routed_experts
,),
dtype
=
torch
.
int64
).
contiguous
()
weights
=
torch
.
rand
((
n_routed_experts
,),
dtype
=
torch
.
float32
).
contiguous
()
input
=
torch
.
randn
((
1
,
hidden_size
),
dtype
=
torch
.
float32
).
contiguous
()
start
=
time
.
perf_counter
()
if
quant_mode
==
"qint8"
:
input_q
=
torch
.
quantize_per_tensor
(
input
,
scale
,
zero_point
,
torch
.
quint8
)
t_output
=
torch
.
zeros
((
1
,
hidden_size
),
dtype
=
torch
.
float32
).
contiguous
()
gate_proj
=
gate_projs
[
i
%
layer_num
]
up_proj
=
up_projs
[
i
%
layer_num
]
down_proj
=
down_projs
[
i
%
layer_num
]
for
i
,
expert_id
in
enumerate
(
expert_ids
):
quantized_gate
=
gate_proj
[
expert_id
]
gate_buf
=
quantized_gate
(
input_q
)
quantized_up
=
up_proj
[
expert_id
]
up_buf
=
quantized_up
(
input_q
)
gate_buf
=
gate_buf
.
dequantize
()
up_buf
=
up_buf
.
dequantize
()
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
intermediate_q
=
torch
.
quantize_per_tensor
(
intermediate
,
scale
,
zero_point
,
torch
.
quint8
)
quantized_down
=
down_proj
[
expert_id
]
expert_output
=
quantized_down
(
intermediate_q
)
expert_output
=
expert_output
.
dequantize
()
t_output
+=
weights
[
i
]
*
expert_output
else
:
t_output
=
torch
.
zeros
((
1
,
hidden_size
),
dtype
=
proj_type
).
contiguous
()
gate_proj
=
gate_projs
[
i
%
layer_num
]
up_proj
=
up_projs
[
i
%
layer_num
]
down_proj
=
down_projs
[
i
%
layer_num
]
for
i
,
expert_id
in
enumerate
(
expert_ids
):
gate_buf
=
torch
.
mm
(
input
.
to
(
proj_type
),
gate_proj
[
expert_id
].
t
())
up_buf
=
torch
.
mm
(
input
.
to
(
proj_type
),
up_proj
[
expert_id
].
t
())
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
expert_output
=
torch
.
mm
(
intermediate
.
to
(
proj_type
),
down_proj
[
expert_id
].
t
())
t_output
+=
weights
[
i
]
*
expert_output
end
=
time
.
perf_counter
()
total_time
+=
end
-
start
moe_torch
(
input
[
i
%
layer_num
],
expert_ids
[
i
%
layer_num
],
weights
[
i
%
layer_num
],
gate_projs
[
i
%
layer_num
],
up_projs
[
i
%
layer_num
],
down_projs
[
i
%
layer_num
])
end
=
time
.
perf_counter
()
total_time
=
end
-
start
print
(
'Quant mode: '
,
quant_mode
)
print
(
'Time(s): '
,
total_time
)
print
(
'Iteration: '
,
test_iter
)
...
...
ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h
View file @
f2938031
/**
* @Description :
* @Description :
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-0
7-25 10:33
:4
2
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
* @LastEditors : chenht2022
* @LastEditTime : 2024-0
8-07 09:47
:4
3
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_CPUINFER_H
#define CPUINFER_CPUINFER_H
...
...
@@ -17,6 +17,7 @@
#include <queue>
#include <thread>
#include <vector>
#include "cuda_runtime.h"
#include "backend.h"
#include "task_queue.h"
...
...
@@ -39,16 +40,39 @@ class CPUInfer {
}
template
<
typename
Func
,
typename
Obj
,
typename
...
Args
>
void
submit
(
Func
f
,
Obj
*
obj
,
Args
...
args
)
{
void
enqueue
(
Func
f
,
Obj
*
obj
,
Args
...
args
)
{
task_queue_
->
enqueue
([
=
]()
{
std
::
invoke
(
f
,
*
obj
,
args
...,
backend_
);
});
}
void
submit
(
std
::
pair
<
intptr_t
,
intptr_t
>
params
)
{
void
(
*
func
)(
void
*
)
=
(
void
(
*
)(
void
*
))
params
.
first
;
void
*
args
=
(
void
*
)
params
.
second
;
*
((
CPUInfer
**
)
args
)
=
this
;
func
(
args
);
}
void
sync
()
{
task_queue_
->
sync
();
}
void
submit_with_cuda_stream
(
intptr_t
user_cuda_stream
,
std
::
pair
<
intptr_t
,
intptr_t
>
params
)
{
void
(
*
func
)(
void
*
)
=
(
void
(
*
)(
void
*
))
params
.
first
;
void
*
args
=
(
void
*
)
params
.
second
;
*
((
CPUInfer
**
)
args
)
=
this
;
cudaLaunchHostFunc
((
cudaStream_t
)
user_cuda_stream
,
(
cudaHostFn_t
)
func
,
args
);
}
static
void
sync_
(
void
*
cpu_infer_ptr
)
{
CPUInfer
*
cpuinfer
=
(
CPUInfer
*
)
cpu_infer_ptr
;
cpuinfer
->
sync
();
}
void
sync_with_cuda_stream
(
intptr_t
user_cuda_stream
)
{
cudaLaunchHostFunc
((
cudaStream_t
)
user_cuda_stream
,
(
cudaHostFn_t
)
&
sync_
,
(
void
*
)
this
);
}
public:
Backend
*
backend_
;
TaskQueue
*
task_queue_
;
...
...
ktransformers/ktransformers_ext/examples/test_linear.py
View file @
f2938031
...
...
@@ -6,7 +6,7 @@ Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-0
7-25
10:3
4:00
LastEditTime : 2024-0
8-06
10:3
6:59
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import
os
,
sys
...
...
@@ -15,23 +15,23 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
import
cpuinfer_ext
import
torch
with
torch
.
inference_mode
(
mode
=
True
):
input_size
=
16384
output_size
=
5120
stride
=
32
proj_type
=
1
# ggml_type::GGML_TYPE_F16
hidden_type
=
1
# ggml_type::GGML_TYPE_F16
layer_num
=
10
CPUInfer
=
cpuinfer_ext
.
CPUInfer
(
48
)
validation_iter
=
100
warm_up_iter
=
1000
test_iter
=
10000
input_size
=
16384
output_size
=
5120
stride
=
32
group_max_len
=
1024
proj_type
=
1
# ggml_type::GGML_TYPE_F16
hidden_type
=
1
# ggml_type::GGML_TYPE_F16
qlen
=
30
layer_num
=
10
CPUInfer
=
cpuinfer_ext
.
CPUInfer
(
48
)
validation_iter
=
100
with
torch
.
inference_mode
(
mode
=
True
):
linears
=
[]
projs
=
[]
for
_
in
range
(
layer_num
):
proj
=
torch
.
randn
((
output_size
,
input_size
),
dtype
=
torch
.
float16
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
config
=
cpuinfer_ext
.
linear
.
LinearConfig
(
input_size
,
output_size
,
stride
,
proj
.
data_ptr
(),
proj_type
,
hidden_type
)
config
=
cpuinfer_ext
.
linear
.
LinearConfig
(
input_size
,
output_size
,
stride
,
group_max_len
,
proj
.
data_ptr
(),
proj_type
,
hidden_type
)
linear
=
cpuinfer_ext
.
linear
.
Linear
(
config
)
projs
.
append
(
proj
)
linears
.
append
(
linear
)
...
...
@@ -39,11 +39,17 @@ with torch.inference_mode(mode=True):
# validation
for
i
in
range
(
validation_iter
):
linear
=
linears
[
i
%
layer_num
]
input
=
torch
.
randn
((
1
,
input_size
),
dtype
=
torch
.
float16
).
contiguous
()
output
=
torch
.
empty
((
1
,
output_size
),
dtype
=
torch
.
float16
).
contiguous
()
input
=
torch
.
randn
((
qlen
,
input_size
),
dtype
=
torch
.
float16
).
contiguous
()
output
=
torch
.
empty
((
qlen
,
output_size
),
dtype
=
torch
.
float16
).
contiguous
()
input
=
input
/
100
CPUInfer
.
submit
(
linear
.
forward
,
input
.
data_ptr
(),
output
.
data_ptr
())
CPUInfer
.
submit
(
linear
.
forward
(
qlen
,
input
.
data_ptr
(),
output
.
data_ptr
()
)
)
CPUInfer
.
sync
()
# print('cpuinfer output', output)
...
...
@@ -54,30 +60,3 @@ with torch.inference_mode(mode=True):
diff
=
torch
.
mean
(
torch
.
abs
(
output
-
t_output
))
/
torch
.
mean
(
torch
.
abs
(
t_output
))
print
(
'diff = '
,
diff
)
assert
(
diff
<
0.001
)
# warm up
for
i
in
range
(
warm_up_iter
):
linear
=
linears
[
i
%
layer_num
]
input
=
torch
.
randn
((
1
,
input_size
),
dtype
=
torch
.
float16
).
contiguous
()
output
=
torch
.
empty
((
1
,
output_size
),
dtype
=
torch
.
float16
).
contiguous
()
input
=
input
/
100
CPUInfer
.
submit
(
linear
.
forward
,
input
.
data_ptr
(),
output
.
data_ptr
())
CPUInfer
.
sync
()
# test
total_time
=
0
for
i
in
range
(
test_iter
):
linear
=
linears
[
i
%
layer_num
]
input
=
torch
.
randn
((
1
,
input_size
),
dtype
=
torch
.
float16
).
contiguous
()
output
=
torch
.
empty
((
1
,
output_size
),
dtype
=
torch
.
float16
).
contiguous
()
input
=
input
/
100
start
=
time
.
perf_counter
()
CPUInfer
.
submit
(
linear
.
forward
,
input
.
data_ptr
(),
output
.
data_ptr
())
CPUInfer
.
sync
()
end
=
time
.
perf_counter
()
total_time
+=
end
-
start
print
(
'Time: '
,
total_time
)
print
(
'Iteration: '
,
test_iter
)
print
(
'Time per iteration: '
,
total_time
/
test_iter
)
print
(
'Bandwidth: '
,
input_size
*
output_size
*
2
*
test_iter
/
total_time
/
1000
/
1000
/
1000
,
'GB/s'
)
print
(
"All tasks completed."
)
\ No newline at end of file
ktransformers/ktransformers_ext/examples/test_mlp.py
View file @
f2938031
...
...
@@ -6,7 +6,7 @@ Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-0
7-25
10:3
4:03
LastEditTime : 2024-0
8-06
10:3
7:28
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import
os
,
sys
...
...
@@ -15,20 +15,30 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
import
cpuinfer_ext
import
torch
with
torch
.
inference_mode
(
mode
=
True
):
hidden_size
=
5120
intermediate_size
=
3072
stride
=
32
gate_type
=
1
# ggml_type::GGML_TYPE_F16
up_type
=
1
# ggml_type::GGML_TYPE_F16
down_type
=
1
# ggml_type::GGML_TYPE_F16
hidden_type
=
1
# ggml_type::GGML_TYPE_F16
layer_num
=
10
CPUInfer
=
cpuinfer_ext
.
CPUInfer
(
48
)
validation_iter
=
100
warm_up_iter
=
1000
test_iter
=
10000
hidden_size
=
5120
intermediate_size
=
3072
stride
=
32
group_max_len
=
1024
gate_type
=
1
# ggml_type::GGML_TYPE_F16
up_type
=
1
# ggml_type::GGML_TYPE_F16
down_type
=
1
# ggml_type::GGML_TYPE_F16
hidden_type
=
1
# ggml_type::GGML_TYPE_F16
qlen
=
30
layer_num
=
10
CPUInfer
=
cpuinfer_ext
.
CPUInfer
(
48
)
validation_iter
=
100
def
act_fn
(
x
):
return
x
/
(
1.0
+
torch
.
exp
(
-
x
))
def
mlp_torch
(
input
,
gate_proj
,
up_proj
,
down_proj
):
gate_buf
=
torch
.
mm
(
input
,
gate_proj
.
t
())
up_buf
=
torch
.
mm
(
input
,
up_proj
.
t
())
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
ret
=
torch
.
mm
(
intermediate
,
down_proj
.
t
())
return
ret
with
torch
.
inference_mode
(
mode
=
True
):
mlps
=
[]
gate_projs
=
[]
up_projs
=
[]
...
...
@@ -37,7 +47,7 @@ with torch.inference_mode(mode=True):
gate_proj
=
torch
.
randn
((
intermediate_size
,
hidden_size
),
dtype
=
torch
.
float16
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
up_proj
=
torch
.
randn
((
intermediate_size
,
hidden_size
),
dtype
=
torch
.
float16
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
down_proj
=
torch
.
randn
((
hidden_size
,
intermediate_size
),
dtype
=
torch
.
float16
,
device
=
"cuda"
).
to
(
"cpu"
).
contiguous
()
config
=
cpuinfer_ext
.
mlp
.
MLPConfig
(
hidden_size
,
intermediate_size
,
stride
,
gate_proj
.
data_ptr
(),
up_proj
.
data_ptr
(),
down_proj
.
data_ptr
(),
gate_type
,
up_type
,
down_type
,
hidden_type
)
config
=
cpuinfer_ext
.
mlp
.
MLPConfig
(
hidden_size
,
intermediate_size
,
stride
,
group_max_len
,
gate_proj
.
data_ptr
(),
up_proj
.
data_ptr
(),
down_proj
.
data_ptr
(),
gate_type
,
up_type
,
down_type
,
hidden_type
)
mlp
=
cpuinfer_ext
.
mlp
.
MLP
(
config
)
gate_projs
.
append
(
gate_proj
)
up_projs
.
append
(
up_proj
)
...
...
@@ -47,52 +57,26 @@ with torch.inference_mode(mode=True):
# validation
for
i
in
range
(
validation_iter
):
mlp
=
mlps
[
i
%
layer_num
]
input
=
torch
.
randn
((
1
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
output
=
torch
.
empty
((
1
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
input
=
torch
.
randn
((
qlen
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
output
=
torch
.
empty
((
qlen
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
input
=
input
/
100
CPUInfer
.
submit
(
mlp
.
forward
,
input
.
data_ptr
(),
output
.
data_ptr
())
CPUInfer
.
submit
(
mlp
.
forward
(
qlen
,
input
.
data_ptr
(),
output
.
data_ptr
()
)
)
CPUInfer
.
sync
()
# print('cpuinfer output', output)
def
act_fn
(
x
):
return
x
/
(
1.0
+
torch
.
exp
(
-
x
))
gate_proj
=
gate_projs
[
i
%
layer_num
]
up_proj
=
up_projs
[
i
%
layer_num
]
down_proj
=
down_projs
[
i
%
layer_num
]
gate_buf
=
torch
.
mm
(
input
,
gate_proj
.
t
())
up_buf
=
torch
.
mm
(
input
,
up_proj
.
t
())
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
t_output
=
torch
.
mm
(
intermediate
,
down_proj
.
t
())
t_output
=
mlp_torch
(
input
,
gate_proj
,
up_proj
,
down_proj
)
# print('torch output', t_output)
diff
=
torch
.
mean
(
torch
.
abs
(
output
-
t_output
))
/
torch
.
mean
(
torch
.
abs
(
t_output
))
print
(
'diff = '
,
diff
)
assert
(
diff
<
0.001
)
# warm up
for
i
in
range
(
warm_up_iter
):
mlp
=
mlps
[
i
%
layer_num
]
input
=
torch
.
randn
((
1
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
output
=
torch
.
empty
((
1
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
input
=
input
/
100
CPUInfer
.
submit
(
mlp
.
forward
,
input
.
data_ptr
(),
output
.
data_ptr
())
CPUInfer
.
sync
()
# test
total_time
=
0
for
i
in
range
(
test_iter
):
mlp
=
mlps
[
i
%
layer_num
]
input
=
torch
.
randn
((
1
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
output
=
torch
.
empty
((
1
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
input
=
input
/
100
start
=
time
.
time
()
CPUInfer
.
submit
(
mlp
.
forward
,
input
.
data_ptr
(),
output
.
data_ptr
())
CPUInfer
.
sync
()
end
=
time
.
time
()
total_time
+=
end
-
start
print
(
'Time: '
,
total_time
)
print
(
'Iteration: '
,
test_iter
)
print
(
'Time per iteration: '
,
total_time
/
test_iter
)
print
(
'Bandwidth: '
,
hidden_size
*
intermediate_size
*
3
*
2
*
test_iter
/
total_time
/
1024
/
1024
/
1024
,
'GB/s'
)
print
(
"All tasks completed."
)
\ No newline at end of file
ktransformers/ktransformers_ext/examples/test_moe.py
View file @
f2938031
...
...
@@ -6,7 +6,7 @@ Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-0
7-25
10:3
4
:0
6
LastEditTime : 2024-0
8-06
10:3
8
:0
5
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import
os
,
sys
...
...
@@ -15,25 +15,64 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
import
cpuinfer_ext
import
torch
with
torch
.
inference_mode
(
mode
=
True
):
expert_num
=
10
hidden_size
=
5120
intermediate_size
=
1536
stride
=
32
group_min_len
=
10
group_max_len
=
1024
gate_type
=
1
# ggml_type::GGML_TYPE_F16
up_type
=
1
# ggml_type::GGML_TYPE_F16
down_type
=
1
# ggml_type::GGML_TYPE_F16
hidden_type
=
1
# ggml_type::GGML_TYPE_F16
n_routed_experts
=
6
qlen
=
30
layer_num
=
10
CPUInfer
=
cpuinfer_ext
.
CPUInfer
(
48
)
validation_iter
=
100
warm_up_iter
=
1000
test_iter
=
10000
expert_num
=
160
hidden_size
=
5120
intermediate_size
=
1536
stride
=
32
group_min_len
=
10
group_max_len
=
1024
gate_type
=
1
# ggml_type::GGML_TYPE_F16
up_type
=
1
# ggml_type::GGML_TYPE_F16
down_type
=
1
# ggml_type::GGML_TYPE_F16
hidden_type
=
1
# ggml_type::GGML_TYPE_F16
n_routed_experts
=
6
qlen
=
30
layer_num
=
10
CPUInfer
=
cpuinfer_ext
.
CPUInfer
(
48
)
validation_iter
=
100
def
act_fn
(
x
):
return
x
/
(
1.0
+
torch
.
exp
(
-
x
))
def
mlp_torch
(
input
,
gate_proj
,
up_proj
,
down_proj
):
gate_buf
=
torch
.
mm
(
input
,
gate_proj
.
t
())
up_buf
=
torch
.
mm
(
input
,
up_proj
.
t
())
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
ret
=
torch
.
mm
(
intermediate
,
down_proj
.
t
())
return
ret
def
moe_torch
(
input
,
expert_ids
,
weights
,
gate_proj
,
up_proj
,
down_proj
):
cnts
=
expert_ids
.
new_zeros
((
expert_ids
.
shape
[
0
],
expert_num
))
cnts
.
scatter_
(
1
,
expert_ids
,
1
)
tokens_per_expert
=
cnts
.
sum
(
dim
=
0
)
idxs
=
expert_ids
.
view
(
-
1
).
argsort
()
sorted_tokens
=
input
[
idxs
//
expert_ids
.
shape
[
1
]]
outputs
=
[]
start_idx
=
0
for
i
,
num_tokens
in
enumerate
(
tokens_per_expert
):
end_idx
=
start_idx
+
num_tokens
if
num_tokens
==
0
:
continue
tokens_for_this_expert
=
sorted_tokens
[
start_idx
:
end_idx
]
expert_out
=
mlp_torch
(
tokens_for_this_expert
,
gate_proj
[
i
],
up_proj
[
i
],
down_proj
[
i
])
outputs
.
append
(
expert_out
)
start_idx
=
end_idx
outs
=
torch
.
cat
(
outputs
,
dim
=
0
)
if
len
(
outputs
)
else
sorted_tokens
.
new_empty
(
0
)
new_x
=
torch
.
empty_like
(
outs
)
new_x
[
idxs
]
=
outs
t_output
=
(
new_x
.
view
(
*
expert_ids
.
shape
,
-
1
)
.
type
(
weights
.
dtype
)
.
mul_
(
weights
.
unsqueeze
(
dim
=-
1
))
.
sum
(
dim
=
1
)
.
type
(
new_x
.
dtype
)
)
return
t_output
with
torch
.
inference_mode
(
mode
=
True
):
moes
=
[]
gate_projs
=
[]
up_projs
=
[]
...
...
@@ -51,63 +90,32 @@ with torch.inference_mode(mode=True):
# validation
for
i
in
range
(
validation_iter
):
moe
=
moes
[
i
%
layer_num
]
expert_ids
=
torch
.
randint
(
0
,
expert_num
,
(
qlen
,
n_routed_experts
),
dtype
=
torch
.
int64
).
contiguous
()
expert_ids
=
torch
.
stack
([
torch
.
randperm
(
expert_num
)[:
n_routed_experts
]
for
_
in
range
(
qlen
)]).
contiguous
()
weights
=
torch
.
rand
((
qlen
,
n_routed_experts
),
dtype
=
torch
.
float32
).
contiguous
()
input
=
torch
.
randn
((
qlen
,
1
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
output
=
torch
.
empty
((
qlen
,
1
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
input
=
torch
.
randn
((
qlen
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
output
=
torch
.
empty
((
qlen
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
input
=
input
/
100
CPUInfer
.
submit
(
moe
.
forward
,
qlen
,
n_routed_experts
,
expert_ids
.
data_ptr
(),
weights
.
data_ptr
(),
input
.
data_ptr
(),
output
.
data_ptr
())
moe
=
moes
[
i
%
layer_num
]
CPUInfer
.
submit
(
moe
.
forward
(
qlen
,
n_routed_experts
,
expert_ids
.
data_ptr
(),
weights
.
data_ptr
(),
input
.
data_ptr
(),
output
.
data_ptr
()
)
)
CPUInfer
.
sync
()
# print('cpuinfer output', output)
def
act_fn
(
x
):
return
x
/
(
1.0
+
torch
.
exp
(
-
x
))
t_output
=
torch
.
zeros
((
qlen
,
1
,
hidden_size
),
dtype
=
torch
.
float32
).
contiguous
()
gate_proj
=
gate_projs
[
i
%
layer_num
]
up_proj
=
up_projs
[
i
%
layer_num
]
down_proj
=
down_projs
[
i
%
layer_num
]
for
token_idx
in
range
(
qlen
):
for
i
,
expert_id
in
enumerate
(
expert_ids
[
token_idx
]):
gate_buf
=
torch
.
mm
(
input
[
token_idx
],
gate_proj
[
expert_id
].
t
())
up_buf
=
torch
.
mm
(
input
[
token_idx
],
up_proj
[
expert_id
].
t
())
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
expert_output
=
torch
.
mm
(
intermediate
,
down_proj
[
expert_id
].
t
())
t_output
[
token_idx
]
+=
weights
[
token_idx
][
i
]
*
expert_output
t_output
=
moe_torch
(
input
,
expert_ids
,
weights
,
gate_proj
,
up_proj
,
down_proj
)
# print('torch output', t_output)
diff
=
torch
.
mean
(
torch
.
abs
(
output
-
t_output
))
/
torch
.
mean
(
torch
.
abs
(
t_output
))
print
(
'diff = '
,
diff
)
assert
(
diff
<
0.001
)
# warm up
for
i
in
range
(
warm_up_iter
):
moe
=
moes
[
i
%
layer_num
]
expert_ids
=
torch
.
randint
(
0
,
expert_num
,
(
qlen
,
n_routed_experts
),
dtype
=
torch
.
int64
).
contiguous
()
weights
=
torch
.
rand
((
qlen
,
n_routed_experts
),
dtype
=
torch
.
float32
).
contiguous
()
input
=
torch
.
randn
((
qlen
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
output
=
torch
.
empty
((
qlen
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
input
=
input
/
100
CPUInfer
.
submit
(
moe
.
forward
,
qlen
,
n_routed_experts
,
expert_ids
.
data_ptr
(),
weights
.
data_ptr
(),
input
.
data_ptr
(),
output
.
data_ptr
())
CPUInfer
.
sync
()
# test
total_time
=
0
for
i
in
range
(
test_iter
):
moe
=
moes
[
i
%
layer_num
]
expert_ids
=
torch
.
randint
(
0
,
expert_num
,
(
qlen
,
n_routed_experts
),
dtype
=
torch
.
int64
).
contiguous
()
weights
=
torch
.
rand
((
qlen
,
n_routed_experts
),
dtype
=
torch
.
float32
).
contiguous
()
input
=
torch
.
randn
((
qlen
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
output
=
torch
.
empty
((
qlen
,
hidden_size
),
dtype
=
torch
.
float16
).
contiguous
()
input
=
input
/
100
start
=
time
.
perf_counter
()
CPUInfer
.
submit
(
moe
.
forward
,
qlen
,
n_routed_experts
,
expert_ids
.
data_ptr
(),
weights
.
data_ptr
(),
input
.
data_ptr
(),
output
.
data_ptr
())
CPUInfer
.
sync
()
end
=
time
.
perf_counter
()
total_time
+=
end
-
start
print
(
'Time: '
,
total_time
)
print
(
'Iteration: '
,
test_iter
)
print
(
'Time per iteration: '
,
total_time
/
test_iter
)
print
(
'Bandwidth: '
,
hidden_size
*
intermediate_size
*
3
*
n_routed_experts
*
2
*
test_iter
/
total_time
/
1000
/
1000
/
1000
,
'GB/s'
)
print
(
"All tasks completed."
)
\ No newline at end of file
ktransformers/ktransformers_ext/ext_bindings.cpp
View file @
f2938031
...
...
@@ -3,8 +3,8 @@
* @Author : chenht2022
* @Date : 2024-07-22 02:03:22
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-0
7-25
10:3
4:23
* @LastEditors : chenht2022
* @LastEditTime : 2024-0
8-07
10:3
9:37
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
// Python bindings
...
...
@@ -12,7 +12,6 @@
#include <iostream>
#include <memory>
#include "cpu_backend/cpuinfer.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "llamafile/flags.h"
#include "operators/llamafile/linear.h"
...
...
@@ -26,239 +25,155 @@
namespace
py
=
pybind11
;
using
namespace
pybind11
::
literals
;
// Binding functions for the Linear class
class
LinearBindings
{
public:
static
void
bind_forward
(
CPUInfer
&
cpuinfer
,
Linear
*
linear
,
py
::
args
args
,
py
::
kwargs
kwargs
)
{
auto
input
=
args
[
0
].
cast
<
intptr_t
>
();
auto
output
=
args
[
1
].
cast
<
intptr_t
>
();
cpuinfer
.
submit
(
&
Linear
::
forward
,
linear
,
(
const
void
*
)
input
,
(
void
*
)
output
);
}
static
void
bind_warm_up
(
CPUInfer
&
cpuinfer
,
Linear
*
linear
,
py
::
args
args
,
py
::
kwargs
kwargs
)
{
cpuinfer
.
submit
(
&
Linear
::
warm_up
,
linear
);
}
static
void
bind_functions
(
CPUInfer
&
cpuinfer
,
py
::
object
func
,
py
::
args
args
,
py
::
kwargs
kwargs
)
{
auto
linear
=
func
.
attr
(
"__self__"
).
cast
<
Linear
*>
();
std
::
string
func_name
=
py
::
str
(
func
.
attr
(
"__func__"
).
attr
(
"__name__"
));
if
(
func_name
==
"forward"
)
{
bind_forward
(
cpuinfer
,
linear
,
args
,
kwargs
);
}
else
if
(
func_name
==
"warm_up"
)
{
bind_warm_up
(
cpuinfer
,
linear
,
args
,
kwargs
);
}
else
{
throw
py
::
value_error
(
"Unsupported function: "
+
std
::
string
(
func_name
));
class
WarmUpBindinds
{
public:
struct
Args
{
CPUInfer
*
cpuinfer
;
Linear
*
linear
;
};
static
void
inner
(
void
*
args
)
{
Args
*
args_
=
(
Args
*
)
args
;
args_
->
cpuinfer
->
enqueue
(
&
Linear
::
warm_up
,
args_
->
linear
);
}
static
std
::
pair
<
intptr_t
,
intptr_t
>
interface
(
Linear
&
linear
)
{
Args
*
args
=
new
Args
{
nullptr
,
&
linear
};
return
std
::
make_pair
((
intptr_t
)
&
inner
,
(
intptr_t
)
args
);
}
};
class
ForwardBindings
{
public:
struct
Args
{
CPUInfer
*
cpuinfer
;
Linear
*
linear
;
int
qlen
;
const
void
*
input
;
void
*
output
;
};
static
void
inner
(
void
*
args
)
{
Args
*
args_
=
(
Args
*
)
args
;
args_
->
cpuinfer
->
enqueue
(
&
Linear
::
forward
,
args_
->
linear
,
args_
->
qlen
,
args_
->
input
,
args_
->
output
);
}
}
static
std
::
pair
<
intptr_t
,
intptr_t
>
interface
(
Linear
&
linear
,
int
qlen
,
intptr_t
input
,
intptr_t
output
)
{
Args
*
args
=
new
Args
{
nullptr
,
&
linear
,
qlen
,
(
const
void
*
)
input
,
(
void
*
)
output
};
return
std
::
make_pair
((
intptr_t
)
&
inner
,
(
intptr_t
)
args
);
}
};
};
// Binding functions for the MLP class
class
MLPBindings
{
public:
static
void
bind_forward
(
CPUInfer
&
cpuinfer
,
MLP
*
mlp
,
py
::
args
args
,
py
::
kwargs
kwargs
)
{
auto
input
=
args
[
0
].
cast
<
intptr_t
>
();
auto
output
=
args
[
1
].
cast
<
intptr_t
>
();
cpuinfer
.
submit
(
&
MLP
::
forward
,
mlp
,
(
const
void
*
)
input
,
(
void
*
)
output
);
}
static
void
bind_warm_up
(
CPUInfer
&
cpuinfer
,
MLP
*
mlp
,
py
::
args
args
,
py
::
kwargs
kwargs
)
{
cpuinfer
.
submit
(
&
MLP
::
warm_up
,
mlp
);
}
static
void
bind_functions
(
CPUInfer
&
cpuinfer
,
py
::
object
func
,
py
::
args
args
,
py
::
kwargs
kwargs
)
{
auto
mlp
=
func
.
attr
(
"__self__"
).
cast
<
MLP
*>
();
std
::
string
func_name
=
py
::
str
(
func
.
attr
(
"__func__"
).
attr
(
"__name__"
));
if
(
func_name
==
"forward"
)
{
bind_forward
(
cpuinfer
,
mlp
,
args
,
kwargs
);
}
else
if
(
func_name
==
"warm_up"
)
{
bind_warm_up
(
cpuinfer
,
mlp
,
args
,
kwargs
);
}
else
{
throw
py
::
value_error
(
"Unsupported function: "
+
std
::
string
(
func_name
));
class
WarmUpBindinds
{
public:
struct
Args
{
CPUInfer
*
cpuinfer
;
MLP
*
mlp
;
};
static
void
inner
(
void
*
args
)
{
Args
*
args_
=
(
Args
*
)
args
;
args_
->
cpuinfer
->
enqueue
(
&
MLP
::
warm_up
,
args_
->
mlp
);
}
}
static
std
::
pair
<
intptr_t
,
intptr_t
>
interface
(
MLP
&
mlp
)
{
Args
*
args
=
new
Args
{
nullptr
,
&
mlp
};
return
std
::
make_pair
((
intptr_t
)
&
inner
,
(
intptr_t
)
args
);
}
};
class
ForwardBindings
{
public:
struct
Args
{
CPUInfer
*
cpuinfer
;
MLP
*
mlp
;
int
qlen
;
const
void
*
input
;
void
*
output
;
};
static
void
inner
(
void
*
args
)
{
Args
*
args_
=
(
Args
*
)
args
;
args_
->
cpuinfer
->
enqueue
(
&
MLP
::
forward
,
args_
->
mlp
,
args_
->
qlen
,
args_
->
input
,
args_
->
output
);
}
static
std
::
pair
<
intptr_t
,
intptr_t
>
interface
(
MLP
&
mlp
,
int
qlen
,
intptr_t
input
,
intptr_t
output
)
{
Args
*
args
=
new
Args
{
nullptr
,
&
mlp
,
qlen
,
(
const
void
*
)
input
,
(
void
*
)
output
};
return
std
::
make_pair
((
intptr_t
)
&
inner
,
(
intptr_t
)
args
);
}
};
};
// Binding functions for the MOE class
class
MOEBindings
{
public:
static
void
bind_forward
(
CPUInfer
&
cpuinfer
,
MOE
*
moe
,
py
::
args
args
,
py
::
kwargs
kwargs
)
{
int
qlen
=
args
[
0
].
cast
<
int
>
();
int
k
=
args
[
1
].
cast
<
int
>
();
auto
expert_ids
=
args
[
2
].
cast
<
intptr_t
>
();
auto
weights
=
args
[
3
].
cast
<
intptr_t
>
();
auto
input
=
args
[
4
].
cast
<
intptr_t
>
();
auto
output
=
args
[
5
].
cast
<
intptr_t
>
();
cpuinfer
.
submit
(
&
MOE
::
forward
,
moe
,
qlen
,
k
,
(
const
uint64_t
*
)
expert_ids
,
(
const
float
*
)
weights
,
(
const
void
*
)
input
,
(
void
*
)
output
);
}
static
void
bind_warm_up
(
CPUInfer
&
cpuinfer
,
MOE
*
moe
,
py
::
args
args
,
py
::
kwargs
kwargs
)
{
cpuinfer
.
submit
(
&
MOE
::
warm_up
,
moe
);
}
static
void
bind_functions
(
CPUInfer
&
cpuinfer
,
py
::
object
func
,
py
::
args
args
,
py
::
kwargs
kwargs
)
{
auto
moe
=
func
.
attr
(
"__self__"
).
cast
<
MOE
*>
();
std
::
string
func_name
=
py
::
str
(
func
.
attr
(
"__func__"
).
attr
(
"__name__"
));
if
(
func_name
==
"forward"
)
{
bind_forward
(
cpuinfer
,
moe
,
args
,
kwargs
);
}
else
if
(
func_name
==
"warm_up"
)
{
bind_warm_up
(
cpuinfer
,
moe
,
args
,
kwargs
);
}
else
{
throw
py
::
value_error
(
"Unsupported function: "
+
std
::
string
(
func_name
));
class
WarmUpBindinds
{
public:
struct
Args
{
CPUInfer
*
cpuinfer
;
MOE
*
moe
;
};
static
void
inner
(
void
*
args
)
{
Args
*
args_
=
(
Args
*
)
args
;
args_
->
cpuinfer
->
enqueue
(
&
MOE
::
warm_up
,
args_
->
moe
);
}
}
};
struct
MOEForwardArgs
{
CPUInfer
*
cpuinfer
;
MOE
*
moe
;
int
qlen
;
int
k
;
uint64_t
*
expert_ids
;
float
*
weights
;
void
*
input
;
void
*
output
;
static
std
::
pair
<
intptr_t
,
intptr_t
>
interface
(
MOE
&
moe
)
{
Args
*
args
=
new
Args
{
nullptr
,
&
moe
};
return
std
::
make_pair
((
intptr_t
)
&
inner
,
(
intptr_t
)
args
);
}
};
class
ForwardBindings
{
public:
struct
Args
{
CPUInfer
*
cpuinfer
;
MOE
*
moe
;
int
qlen
;
int
k
;
const
uint64_t
*
expert_ids
;
const
float
*
weights
;
const
void
*
input
;
void
*
output
;
};
static
void
inner
(
void
*
args
)
{
Args
*
args_
=
(
Args
*
)
args
;
args_
->
cpuinfer
->
enqueue
(
&
MOE
::
forward
,
args_
->
moe
,
args_
->
qlen
,
args_
->
k
,
args_
->
expert_ids
,
args_
->
weights
,
args_
->
input
,
args_
->
output
);
}
static
std
::
pair
<
intptr_t
,
intptr_t
>
interface
(
MOE
&
moe
,
int
qlen
,
int
k
,
intptr_t
expert_ids
,
intptr_t
weights
,
intptr_t
input
,
intptr_t
output
)
{
Args
*
args
=
new
Args
{
nullptr
,
&
moe
,
qlen
,
k
,
(
const
uint64_t
*
)
expert_ids
,
(
const
float
*
)
weights
,
(
const
void
*
)
input
,
(
void
*
)
output
};
return
std
::
make_pair
((
intptr_t
)
&
inner
,
(
intptr_t
)
args
);
}
};
};
void
submit_moe_forward_with_host_args_ptr
(
void
*
host_args_ptr
)
{
MOEForwardArgs
*
host_args
=
(
MOEForwardArgs
*
)
host_args_ptr
;
host_args
->
cpuinfer
->
submit
(
&
MOE
::
forward
,
host_args
->
moe
,
host_args
->
qlen
,
host_args
->
k
,
host_args
->
expert_ids
,
host_args
->
weights
,
host_args
->
input
,
host_args
->
output
);
}
void
cpuinfer_sync
(
void
*
host_args_ptr
)
{
CPUInfer
*
cpuinfer
=
(
CPUInfer
*
)
host_args_ptr
;
cpuinfer
->
sync
();
}
PYBIND11_MODULE
(
cpuinfer_ext
,
m
)
{
auto
linear_module
=
m
.
def_submodule
(
"linear"
);
py
::
class_
<
CPUInfer
>
(
m
,
"CPUInfer"
)
.
def
(
py
::
init
<
int
>
())
.
def
(
"submit"
,
&
CPUInfer
::
submit
)
.
def
(
"submit_with_cuda_stream"
,
&
CPUInfer
::
submit_with_cuda_stream
)
.
def
(
"sync"
,
&
CPUInfer
::
sync
)
.
def
(
"sync_with_cuda_stream"
,
&
CPUInfer
::
sync_with_cuda_stream
);
auto
linear_module
=
m
.
def_submodule
(
"linear"
);
py
::
class_
<
LinearConfig
>
(
linear_module
,
"LinearConfig"
)
.
def
(
py
::
init
([](
int
hidden_size
,
int
intermediate_size
,
int
stride
,
intptr_t
proj
,
int
proj_type
,
int
hidden_type
)
{
return
LinearConfig
(
hidden_size
,
intermediate_size
,
stride
,
(
void
*
)
proj
,
(
ggml_type
)
proj_type
,
(
ggml_type
)
hidden_type
);
.
def
(
py
::
init
([](
int
hidden_size
,
int
intermediate_size
,
int
stride
,
int
group_max_len
,
intptr_t
proj
,
int
proj_type
,
int
hidden_type
)
{
return
LinearConfig
(
hidden_size
,
intermediate_size
,
stride
,
group_max_len
,
(
void
*
)
proj
,
(
ggml_type
)
proj_type
,
(
ggml_type
)
hidden_type
);
}));
py
::
class_
<
Linear
>
(
linear_module
,
"Linear"
)
.
def
(
py
::
init
<
LinearConfig
>
())
.
def
(
"warm_up"
,
[](
Linear
&
linear
)
{
throw
std
::
runtime_error
(
"!!! Doing nothing, please use CPUInfer.submit to call it!!!
\n
"
);
})
.
def
(
"forward"
,
[](
Linear
&
linear
,
intptr_t
input
,
intptr_t
output
)
{
throw
std
::
runtime_error
(
"!!! Doing nothing, please use CPUInfer.submit to call it!!!
\n
"
);
});
.
def
(
"warm_up"
,
&
LinearBindings
::
WarmUpBindinds
::
interface
)
.
def
(
"forward"
,
&
LinearBindings
::
ForwardBindings
::
interface
);
auto
mlp_module
=
m
.
def_submodule
(
"mlp"
);
py
::
class_
<
MLPConfig
>
(
mlp_module
,
"MLPConfig"
)
.
def
(
py
::
init
([](
int
hidden_size
,
int
intermediate_size
,
int
stride
,
intptr_t
gate_proj
,
intptr_t
up_proj
,
intptr_t
down_proj
,
int
gate_type
,
int
up_type
,
int
down_type
,
int
hidden_type
)
{
return
MLPConfig
(
hidden_size
,
intermediate_size
,
stride
,
(
void
*
)
gate_proj
,
(
void
*
)
up_proj
,
(
void
*
)
down_proj
,
(
ggml_type
)
gate_type
,
(
ggml_type
)
up_type
,
(
ggml_type
)
down_type
,
(
ggml_type
)
hidden_type
);
.
def
(
py
::
init
([](
int
hidden_size
,
int
intermediate_size
,
int
stride
,
int
group_max_len
,
intptr_t
gate_proj
,
intptr_t
up_proj
,
intptr_t
down_proj
,
int
gate_type
,
int
up_type
,
int
down_type
,
int
hidden_type
)
{
return
MLPConfig
(
hidden_size
,
intermediate_size
,
stride
,
group_max_len
,
(
void
*
)
gate_proj
,
(
void
*
)
up_proj
,
(
void
*
)
down_proj
,
(
ggml_type
)
gate_type
,
(
ggml_type
)
up_type
,
(
ggml_type
)
down_type
,
(
ggml_type
)
hidden_type
);
}));
py
::
class_
<
MLP
>
(
mlp_module
,
"MLP"
)
.
def
(
py
::
init
<
MLPConfig
>
())
.
def
(
"warm_up"
,
[](
MLP
&
mlp
)
{
throw
std
::
runtime_error
(
"!!! Doing nothing, please use CPUInfer.submit to call it!!!
\n
"
);
})
.
def
(
"forward"
,
[](
MLP
&
mlp
,
intptr_t
input
,
intptr_t
output
)
{
throw
std
::
runtime_error
(
"!!! Doing nothing, please use CPUInfer.submit to call it!!!
\n
"
);
});
.
def
(
"warm_up"
,
&
MLPBindings
::
WarmUpBindinds
::
interface
)
.
def
(
"forward"
,
&
MLPBindings
::
ForwardBindings
::
interface
);
auto
moe_module
=
m
.
def_submodule
(
"moe"
);
py
::
class_
<
MOEConfig
>
(
moe_module
,
"MOEConfig"
)
.
def
(
py
::
init
([](
int
expert_num
,
int
routed_expert_num
,
int
hidden_size
,
int
intermediate_size
,
int
stride
,
int
group_min_len
,
int
group_max_len
,
intptr_t
gate_proj
,
intptr_t
up_proj
,
intptr_t
down_proj
,
int
gate_type
,
int
up_type
,
int
down_type
,
int
hidden_type
)
{
return
MOEConfig
(
expert_num
,
routed_expert_num
,
hidden_size
,
intermediate_size
,
stride
,
group_min_len
,
group_max_len
,
(
void
*
)
gate_proj
,
(
void
*
)
up_proj
,
(
void
*
)
down_proj
,
(
ggml_type
)
gate_type
,
(
ggml_type
)
up_type
,
(
ggml_type
)
down_type
,
(
ggml_type
)
hidden_type
);
}));
py
::
class_
<
MOE
>
(
moe_module
,
"MOE"
)
.
def
(
py
::
init
<
MOEConfig
>
())
.
def
(
"warm_up"
,
[](
MOE
&
moe
)
{
throw
std
::
runtime_error
(
"!!! Doing nothing, please use CPUInfer.submit to call it!!!
\n
"
);
})
.
def
(
"forward"
,
[](
MOE
&
moe
,
int
k
,
uint64_t
expert_ids
,
intptr_t
weights
,
intptr_t
input
,
intptr_t
output
)
{
throw
std
::
runtime_error
(
"!!! Doing nothing, please use CPUInfer.submit to call it!!!
\n
"
);
});
py
::
class_
<
CPUInfer
>
(
m
,
"CPUInfer"
)
.
def
(
py
::
init
<
int
>
())
.
def
(
"submit"
,
[
linear_module
,
mlp_module
,
moe_module
](
CPUInfer
&
cpuinfer
,
py
::
object
func
,
py
::
args
args
,
py
::
kwargs
kwargs
)
{
if
(
py
::
hasattr
(
func
,
"__self__"
)
&&
py
::
hasattr
(
func
,
"__func__"
))
{
std
::
string
class_name
=
py
::
str
(
func
.
attr
(
"__self__"
)
.
attr
(
"__class__"
)
.
attr
(
"__name__"
));
if
(
class_name
==
"Linear"
)
{
LinearBindings
::
bind_functions
(
cpuinfer
,
func
,
args
,
kwargs
);
}
else
if
(
class_name
==
"MLP"
)
{
MLPBindings
::
bind_functions
(
cpuinfer
,
func
,
args
,
kwargs
);
}
else
if
(
class_name
==
"MOE"
)
{
MOEBindings
::
bind_functions
(
cpuinfer
,
func
,
args
,
kwargs
);
}
else
{
// handle other classes
throw
py
::
type_error
(
"Unsupported class type: "
+
class_name
);
}
}
else
{
// handle cases where func does not have __self__ or
// __func__
throw
py
::
type_error
(
"Invalid function object: missing "
"__self__ or __func__ attribute."
);
}
})
.
def
(
"submit_with_cuda_stream"
,
[
linear_module
,
mlp_module
,
moe_module
](
CPUInfer
&
cpuinfer
,
intptr_t
user_cuda_stream
,
py
::
object
func
,
py
::
args
args
,
py
::
kwargs
kwargs
)
{
if
(
py
::
hasattr
(
func
,
"__self__"
)
&&
py
::
hasattr
(
func
,
"__func__"
))
{
std
::
string
class_name
=
py
::
str
(
func
.
attr
(
"__self__"
)
.
attr
(
"__class__"
)
.
attr
(
"__name__"
));
if
(
class_name
==
"MOE"
)
{
std
::
string
func_name
=
py
::
str
(
func
.
attr
(
"__func__"
).
attr
(
"__name__"
));
if
(
func_name
==
"forward"
)
{
auto
moe
=
func
.
attr
(
"__self__"
).
cast
<
MOE
*>
();
int
qlen
=
args
[
0
].
cast
<
int
>
();
int
k
=
args
[
1
].
cast
<
int
>
();
auto
expert_ids
=
args
[
2
].
cast
<
intptr_t
>
();
auto
weights
=
args
[
3
].
cast
<
intptr_t
>
();
auto
input
=
args
[
4
].
cast
<
intptr_t
>
();
auto
output
=
args
[
5
].
cast
<
intptr_t
>
();
MOEForwardArgs
*
moe_forward_args
=
new
MOEForwardArgs
{
&
cpuinfer
,
moe
,
qlen
,
k
,
(
uint64_t
*
)
expert_ids
,
(
float
*
)
weights
,
(
void
*
)
input
,
(
void
*
)
output
};
// submit_moe_forward_with_host_args_ptr(moe_forward_args);
cudaLaunchHostFunc
((
cudaStream_t
)
user_cuda_stream
,
(
cudaHostFn_t
)
submit_moe_forward_with_host_args_ptr
,
moe_forward_args
);
}
else
{
throw
py
::
value_error
(
"Unsupported function: "
+
std
::
string
(
func_name
));
}
}
else
{
// handle other classes
throw
py
::
type_error
(
"Unsupported class type: "
+
class_name
);
}
}
else
{
// handle cases where func does not have __self__ or
// __func__
throw
py
::
type_error
(
"Invalid function object: missing "
"__self__ or __func__ attribute."
);
}
})
.
def
(
"sync_with_cuda_stream"
,
[](
CPUInfer
&
cpuinfer
,
intptr_t
user_cuda_stream
)
{
// cpuinfer_sync((void*)(&cpuinfer));
cudaLaunchHostFunc
((
cudaStream_t
)
user_cuda_stream
,
(
cudaHostFn_t
)
cpuinfer_sync
,
(
void
*
)(
&
cpuinfer
));
})
.
def
(
"sync"
,
&
CPUInfer
::
sync
);
.
def
(
"warm_up"
,
&
MOEBindings
::
WarmUpBindinds
::
interface
)
.
def
(
"forward"
,
&
MOEBindings
::
ForwardBindings
::
interface
);
}
ktransformers/ktransformers_ext/operators/llamafile/linear.cpp
View file @
f2938031
...
...
@@ -3,7 +3,7 @@
* @Author : chenht2022
* @Date : 2024-07-12 10:07:58
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:34:58
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
...
...
@@ -13,9 +13,15 @@ Linear::Linear(LinearConfig config) {
config_
=
config
;
proj_
=
config_
.
proj
;
input_fp32_
.
resize
(
config_
.
input_size
);
proj_input_
.
resize
(
config_
.
input_size
*
4
);
proj_output_
.
resize
(
config_
.
output_size
);
std
::
vector
<
std
::
pair
<
void
**
,
uint64_t
>>
mem_requests
;
mem_requests
.
push_back
({(
void
**
)
&
input_fp32_
,
sizeof
(
float
)
*
config_
.
group_max_len
*
config_
.
input_size
});
mem_requests
.
push_back
({(
void
**
)
&
proj_input_
,
config_
.
group_max_len
*
config_
.
input_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
proj_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
proj_type
).
vec_dot_type
)});
mem_requests
.
push_back
({(
void
**
)
&
proj_output_
,
sizeof
(
float
)
*
config_
.
group_max_len
*
config_
.
output_size
});
shared_mem_buffer
.
alloc
(
this
,
mem_requests
);
}
Linear
::~
Linear
()
{
shared_mem_buffer
.
dealloc
(
this
);
}
void
Linear
::
warm_up
(
Backend
*
backend
)
{
...
...
@@ -26,22 +32,42 @@ void Linear::warm_up(Backend* backend) {
input_fp32
[
i
]
=
0
;
}
from_float
(
input_fp32
.
data
(),
input
.
data
(),
config_
.
input_size
,
config_
.
hidden_type
);
forward
(
input
.
data
(),
output
.
data
(),
backend
);
forward
_many
(
1
,
input
.
data
(),
output
.
data
(),
backend
);
}
void
Linear
::
forward
(
const
void
*
input
,
void
*
output
,
Backend
*
backend
)
{
void
Linear
::
forward
_many
(
int
qlen
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
)
{
const
void
*
proj_input_ptr
;
if
(
config_
.
hidden_type
==
ggml_internal_get_type_traits
(
config_
.
proj_type
).
vec_dot_type
)
{
proj_input_ptr
=
input
;
}
else
{
to_float
(
input
,
input_fp32_
.
data
(),
config_
.
input_size
,
config_
.
hidden_type
);
from_float
(
input_fp32_
.
data
()
,
proj_input_
.
data
(),
config_
.
input_size
,
ggml_internal_get_type_traits
(
config_
.
proj_type
).
vec_dot_type
);
proj_input_ptr
=
proj_input_
.
data
()
;
to_float
(
input
,
input_fp32_
,
qlen
*
config_
.
input_size
,
config_
.
hidden_type
);
from_float
(
input_fp32_
,
proj_input_
,
qlen
*
config_
.
input_size
,
ggml_internal_get_type_traits
(
config_
.
proj_type
).
vec_dot_type
);
proj_input_ptr
=
proj_input_
;
}
int
nth
=
config_
.
output_size
/
config_
.
stride
;
backend
->
do_work_stealing_job
(
nth
,
[
&
](
int
task_id
)
{
int
ith
=
task_id
%
nth
;
llamafile_sgemm
(
config_
.
output_size
,
1
,
config_
.
input_size
/
ggml_blck_size
(
config_
.
proj_type
),
proj_
,
config_
.
input_size
/
ggml_blck_size
(
config_
.
proj_type
),
proj_input_ptr
,
config_
.
input_size
/
ggml_blck_size
(
config_
.
proj_type
),
proj_output_
.
data
(),
config_
.
output_size
,
ith
,
nth
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
proj_type
,
ggml_internal_get_type_traits
(
config_
.
proj_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
int
ith
=
task_id
;
void
*
proj_ptr
=
proj_
+
ith
*
config_
.
stride
*
config_
.
input_size
*
ggml_type_size
(
config_
.
proj_type
)
/
ggml_blck_size
(
config_
.
proj_type
);
float
*
proj_output_ptr
=
proj_output_
+
ith
*
config_
.
stride
;
llamafile_sgemm
(
config_
.
stride
,
qlen
,
config_
.
input_size
/
ggml_blck_size
(
config_
.
proj_type
),
proj_ptr
,
config_
.
input_size
/
ggml_blck_size
(
config_
.
proj_type
),
proj_input_ptr
,
config_
.
input_size
/
ggml_blck_size
(
config_
.
proj_type
),
proj_output_ptr
,
config_
.
output_size
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
proj_type
,
ggml_internal_get_type_traits
(
config_
.
proj_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
if
(
config_
.
stride
%
ggml_blck_size
(
config_
.
hidden_type
)
==
0
)
{
for
(
int
i
=
0
;
i
<
qlen
;
i
++
)
{
float
*
output_fp32_ptr
=
proj_output_
+
i
*
config_
.
output_size
+
ith
*
config_
.
stride
;
void
*
output_ptr
=
output
+
i
*
config_
.
output_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
)
+
ith
*
config_
.
stride
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
);
from_float
(
output_fp32_ptr
,
output_ptr
,
config_
.
stride
,
config_
.
hidden_type
);
}
}
});
from_float
(
proj_output_
.
data
(),
output
,
config_
.
output_size
,
config_
.
hidden_type
);
if
(
config_
.
stride
%
ggml_blck_size
(
config_
.
hidden_type
)
!=
0
)
{
from_float
(
proj_output_
,
output
,
qlen
*
config_
.
output_size
,
config_
.
hidden_type
);
}
}
void
Linear
::
forward
(
int
qlen
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
)
{
if
(
qlen
<=
0
)
{
return
;
}
int
forward_len
=
std
::
min
(
qlen
,
config_
.
group_max_len
);
forward_many
(
forward_len
,
input
,
output
,
backend
);
forward
(
qlen
-
forward_len
,
input
+
forward_len
*
config_
.
input_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
output
+
forward_len
*
config_
.
output_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
backend
);
}
\ No newline at end of file
ktransformers/ktransformers_ext/operators/llamafile/linear.h
View file @
f2938031
...
...
@@ -3,7 +3,7 @@
* @Author : chenht2022
* @Date : 2024-07-12 10:07:58
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:00
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
...
...
@@ -22,34 +22,38 @@
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
#include "shared_mem_buffer.h"
struct
LinearConfig
{
int
input_size
;
int
output_size
;
int
stride
;
int
group_max_len
;
void
*
proj
;
ggml_type
proj_type
;
ggml_type
hidden_type
;
LinearConfig
()
{}
LinearConfig
(
int
input_size
,
int
output_size
,
int
stride
,
void
*
proj
,
ggml_type
proj_type
,
ggml_type
hidden_type
)
:
input_size
(
input_size
),
output_size
(
output_size
),
stride
(
stride
),
proj
(
proj
),
proj_type
(
proj_type
),
hidden_type
(
hidden_type
)
{}
LinearConfig
(
int
input_size
,
int
output_size
,
int
stride
,
int
group_max_len
,
void
*
proj
,
ggml_type
proj_type
,
ggml_type
hidden_type
)
:
input_size
(
input_size
),
output_size
(
output_size
),
stride
(
stride
),
group_max_len
(
group_max_len
),
proj
(
proj
),
proj_type
(
proj_type
),
hidden_type
(
hidden_type
)
{}
};
class
Linear
{
public:
Linear
(
LinearConfig
);
~
Linear
();
void
warm_up
(
Backend
*
backend
);
void
forward
(
const
void
*
input
,
void
*
output
,
Backend
*
backend
);
void
forward_many
(
int
qlen
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
);
void
forward
(
int
qlen
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
);
private:
LinearConfig
config_
;
void
*
proj_
;
// [output_size * input_size ( /32 if quantized)]
std
::
vector
<
float
>
input_fp32_
;
// [input_size]
std
::
vector
<
uint8_t
>
proj_input_
;
// [
input_size * 4
]
std
::
vector
<
float
>
proj_output_
;
// [output_size]
float
*
input_fp32_
;
// [
group_max_len *
input_size]
uint8_t
*
proj_input_
;
// [
group_max_len * input_size * ggml_type_size(ggml_internal_get_type_traits(proj_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(proj_type).vec_dot_type)
]
float
*
proj_output_
;
// [
group_max_len *
output_size]
};
#endif
\ No newline at end of file
ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp
View file @
f2938031
...
...
@@ -3,7 +3,7 @@
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:04
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
...
...
@@ -15,14 +15,20 @@ MLP::MLP(MLPConfig config) {
up_proj_
=
config_
.
up_proj
;
down_proj_
=
config_
.
down_proj
;
input_fp32_
.
resize
(
config_
.
hidden_size
);
gate_input_
.
resize
(
config_
.
hidden_size
*
4
);
up_input_
.
resize
(
config_
.
hidden_size
*
4
);
gate_output_
.
resize
(
config_
.
intermediate_size
);
up_output_
.
resize
(
config_
.
intermediate_size
);
intermediate_fp32_
.
resize
(
config_
.
intermediate_size
);
down_input_
.
resize
(
config_
.
intermediate_size
*
4
);
down_output_
.
resize
(
config_
.
hidden_size
);
std
::
vector
<
std
::
pair
<
void
**
,
uint64_t
>>
mem_requests
;
mem_requests
.
push_back
({(
void
**
)
&
input_fp32_
,
sizeof
(
float
)
*
config_
.
group_max_len
*
config_
.
hidden_size
});
mem_requests
.
push_back
({(
void
**
)
&
gate_input_
,
config_
.
group_max_len
*
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
)});
mem_requests
.
push_back
({(
void
**
)
&
up_input_
,
config_
.
group_max_len
*
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)});
mem_requests
.
push_back
({(
void
**
)
&
gate_output_
,
sizeof
(
float
)
*
config_
.
group_max_len
*
config_
.
intermediate_size
});
mem_requests
.
push_back
({(
void
**
)
&
up_output_
,
sizeof
(
float
)
*
config_
.
group_max_len
*
config_
.
intermediate_size
});
mem_requests
.
push_back
({(
void
**
)
&
intermediate_fp32_
,
sizeof
(
float
)
*
config_
.
group_max_len
*
config_
.
intermediate_size
});
mem_requests
.
push_back
({(
void
**
)
&
down_input_
,
config_
.
group_max_len
*
config_
.
intermediate_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)});
mem_requests
.
push_back
({(
void
**
)
&
down_output_
,
sizeof
(
float
)
*
config_
.
group_max_len
*
config_
.
hidden_size
});
shared_mem_buffer
.
alloc
(
this
,
mem_requests
);
}
MLP
::~
MLP
()
{
shared_mem_buffer
.
dealloc
(
this
);
}
void
MLP
::
warm_up
(
Backend
*
backend
)
{
...
...
@@ -33,33 +39,33 @@ void MLP::warm_up(Backend* backend) {
input_fp32
[
i
]
=
0
;
}
from_float
(
input_fp32
.
data
(),
input
.
data
(),
config_
.
hidden_size
,
config_
.
hidden_type
);
forward
(
input
.
data
(),
output
.
data
(),
backend
);
forward
_many
(
1
,
input
.
data
(),
output
.
data
(),
backend
);
}
static
float
act_fn
(
float
x
)
{
return
x
/
(
1.0
f
+
expf
(
-
x
));
}
void
MLP
::
forward
(
const
void
*
input
,
void
*
output
,
Backend
*
backend
)
{
void
MLP
::
forward
_many
(
int
qlen
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
)
{
const
void
*
gate_input_ptr
;
const
void
*
up_input_ptr
;
if
(
config_
.
hidden_type
==
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
&&
config_
.
hidden_type
==
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
{
gate_input_ptr
=
up_input_ptr
=
input
;
}
else
{
to_float
(
input
,
input_fp32_
.
data
(),
config_
.
hidden_size
,
config_
.
hidden_type
);
to_float
(
input
,
input_fp32_
,
qlen
*
config_
.
hidden_size
,
config_
.
hidden_type
);
if
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
==
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
{
from_float
(
input_fp32_
.
data
()
,
gate_input_
.
data
(),
config_
.
hidden_size
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
);
gate_input_ptr
=
up_input_ptr
=
gate_input_
.
data
()
;
from_float
(
input_fp32_
,
gate_input_
,
qlen
*
config_
.
hidden_size
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
);
gate_input_ptr
=
up_input_ptr
=
gate_input_
;
}
else
{
if
(
config_
.
hidden_type
!=
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
)
{
from_float
(
input_fp32_
.
data
()
,
gate_input_
.
data
(),
config_
.
hidden_size
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
);
gate_input_ptr
=
gate_input_
.
data
()
;
from_float
(
input_fp32_
,
gate_input_
,
qlen
*
config_
.
hidden_size
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
);
gate_input_ptr
=
gate_input_
;
}
else
{
gate_input_ptr
=
input
;
}
if
(
config_
.
hidden_type
!=
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
{
from_float
(
input_fp32_
.
data
()
,
up_input_
.
data
(),
config_
.
hidden_size
,
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
);
up_input_ptr
=
up_input_
.
data
()
;
from_float
(
input_fp32_
,
up_input_
,
qlen
*
config_
.
hidden_size
,
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
);
up_input_ptr
=
up_input_
;
}
else
{
up_input_ptr
=
input
;
}
...
...
@@ -69,35 +75,49 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
backend
->
do_work_stealing_job
(
nth
,
[
&
](
int
task_id
)
{
int
ith
=
task_id
;
void
*
gate_proj_ptr
=
gate_proj_
+
ith
*
config_
.
stride
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
gate_type
)
/
ggml_blck_size
(
config_
.
gate_type
);
float
*
gate_output_ptr
=
gate_output_
.
data
()
+
ith
*
config_
.
stride
;
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_output_ptr
,
config_
.
strid
e
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
gate_type
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
float
*
gate_output_ptr
=
gate_output_
+
ith
*
config_
.
stride
;
llamafile_sgemm
(
config_
.
stride
,
qlen
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_output_ptr
,
config_
.
intermediate_siz
e
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
gate_type
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
void
*
up_proj_ptr
=
up_proj_
+
ith
*
config_
.
stride
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
up_type
)
/
ggml_blck_size
(
config_
.
up_type
);
float
*
up_output_ptr
=
up_output_
.
data
()
+
ith
*
config_
.
stride
;
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_output_ptr
,
config_
.
stride
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
up_type
,
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
for
(
int
i
=
ith
*
config_
.
stride
;
i
<
(
ith
+
1
)
*
config_
.
stride
;
i
++
)
{
intermediate_fp32_
[
i
]
=
act_fn
(
gate_output_
[
i
])
*
up_output_
[
i
];
}
if
(
config_
.
stride
%
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
==
0
)
{
float
*
intermediate_fp32_ptr
=
intermediate_fp32_
.
data
()
+
ith
*
config_
.
stride
;
void
*
down_input_ptr
=
down_input_
.
data
()
+
ith
*
config_
.
stride
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
from_float
(
intermediate_fp32_ptr
,
down_input_ptr
,
config_
.
stride
,
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
float
*
up_output_ptr
=
up_output_
+
ith
*
config_
.
stride
;
llamafile_sgemm
(
config_
.
stride
,
qlen
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_output_ptr
,
config_
.
intermediate_size
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
up_type
,
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
for
(
int
i
=
0
;
i
<
qlen
;
i
++
)
{
for
(
int
j
=
ith
*
config_
.
stride
;
j
<
(
ith
+
1
)
*
config_
.
stride
;
j
++
)
{
intermediate_fp32_
[
i
*
config_
.
intermediate_size
+
j
]
=
act_fn
(
gate_output_
[
i
*
config_
.
intermediate_size
+
j
])
*
up_output_
[
i
*
config_
.
intermediate_size
+
j
];
}
if
(
config_
.
stride
%
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
==
0
)
{
float
*
intermediate_fp32_ptr
=
intermediate_fp32_
+
i
*
config_
.
intermediate_size
+
ith
*
config_
.
stride
;
void
*
down_input_ptr
=
down_input_
+
i
*
config_
.
intermediate_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
+
ith
*
config_
.
stride
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
from_float
(
intermediate_fp32_ptr
,
down_input_ptr
,
config_
.
stride
,
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
}
}
});
if
(
config_
.
stride
%
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
!=
0
)
{
from_float
(
intermediate_fp32_
.
data
()
,
down_input_
.
data
(),
config_
.
intermediate_size
,
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
from_float
(
intermediate_fp32_
,
down_input_
,
qlen
*
config_
.
intermediate_size
,
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
}
nth
=
config_
.
hidden_size
/
config_
.
stride
;
backend
->
do_work_stealing_job
(
nth
,
[
&
](
int
task_id
)
{
int
ith
=
task_id
;
void
*
down_proj_ptr
=
down_proj_
+
ith
*
config_
.
stride
*
config_
.
intermediate_size
*
ggml_type_size
(
config_
.
down_type
)
/
ggml_blck_size
(
config_
.
down_type
);
float
*
down_output_ptr
=
down_output_
.
data
()
+
ith
*
config_
.
stride
;
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_proj_ptr
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_input_
.
data
()
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_output_ptr
,
config_
.
strid
e
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
down_type
,
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
float
*
down_output_ptr
=
down_output_
+
ith
*
config_
.
stride
;
llamafile_sgemm
(
config_
.
stride
,
qlen
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_proj_ptr
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_input_
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_output_ptr
,
config_
.
hidden_siz
e
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
down_type
,
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
if
(
config_
.
stride
%
ggml_blck_size
(
config_
.
hidden_type
)
==
0
)
{
void
*
output_ptr
=
output
+
ith
*
config_
.
stride
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
);
from_float
(
down_output_ptr
,
output_ptr
,
config_
.
stride
,
config_
.
hidden_type
);
for
(
int
i
=
0
;
i
<
qlen
;
i
++
)
{
float
*
output_fp32_ptr
=
down_output_
+
i
*
config_
.
hidden_size
+
ith
*
config_
.
stride
;
void
*
output_ptr
=
output
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
)
+
ith
*
config_
.
stride
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
);
from_float
(
output_fp32_ptr
,
output_ptr
,
config_
.
stride
,
config_
.
hidden_type
);
}
}
});
if
(
config_
.
stride
%
ggml_blck_size
(
config_
.
hidden_type
)
!=
0
)
{
from_float
(
down_output_
.
data
()
,
output
,
config_
.
hidden_size
,
config_
.
hidden_type
);
from_float
(
down_output_
,
output
,
qlen
*
config_
.
hidden_size
,
config_
.
hidden_type
);
}
}
void
MLP
::
forward
(
int
qlen
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
)
{
if
(
qlen
<=
0
)
{
return
;
}
int
forward_len
=
std
::
min
(
qlen
,
config_
.
group_max_len
);
forward_many
(
forward_len
,
input
,
output
,
backend
);
forward
(
qlen
-
forward_len
,
input
+
forward_len
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
output
+
forward_len
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
backend
);
}
\ No newline at end of file
ktransformers/ktransformers_ext/operators/llamafile/mlp.h
View file @
f2938031
...
...
@@ -3,7 +3,7 @@
* @Author : chenht2022
* @Date : 2024-07-12 10:07:58
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:06
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
...
...
@@ -22,11 +22,13 @@
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
#include "shared_mem_buffer.h"
struct
MLPConfig
{
int
hidden_size
;
int
intermediate_size
;
int
stride
;
int
group_max_len
;
void
*
gate_proj
;
void
*
up_proj
;
void
*
down_proj
;
...
...
@@ -37,15 +39,17 @@ struct MLPConfig {
MLPConfig
()
{}
MLPConfig
(
int
hidden_size
,
int
intermediate_size
,
int
stride
,
void
*
gate_proj
,
void
*
up_proj
,
void
*
down_proj
,
ggml_type
gate_type
,
ggml_type
up_type
,
ggml_type
down_type
,
ggml_type
hidden_type
)
:
hidden_size
(
hidden_size
),
intermediate_size
(
intermediate_size
),
stride
(
stride
),
gate_proj
(
gate_proj
),
up_proj
(
up_proj
),
down_proj
(
down_proj
),
gate_type
(
gate_type
),
up_type
(
up_type
),
down_type
(
down_type
),
hidden_type
(
hidden_type
)
{}
MLPConfig
(
int
hidden_size
,
int
intermediate_size
,
int
stride
,
int
group_max_len
,
void
*
gate_proj
,
void
*
up_proj
,
void
*
down_proj
,
ggml_type
gate_type
,
ggml_type
up_type
,
ggml_type
down_type
,
ggml_type
hidden_type
)
:
hidden_size
(
hidden_size
),
intermediate_size
(
intermediate_size
),
stride
(
stride
),
group_max_len
(
group_max_len
),
gate_proj
(
gate_proj
),
up_proj
(
up_proj
),
down_proj
(
down_proj
),
gate_type
(
gate_type
),
up_type
(
up_type
),
down_type
(
down_type
),
hidden_type
(
hidden_type
)
{}
};
class
MLP
{
public:
MLP
(
MLPConfig
);
~
MLP
();
void
warm_up
(
Backend
*
backend
);
void
forward
(
const
void
*
input
,
void
*
output
,
Backend
*
backend
);
void
forward_many
(
int
qlen
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
);
void
forward
(
int
qlen
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
);
private:
MLPConfig
config_
;
...
...
@@ -53,14 +57,14 @@ class MLP {
void
*
up_proj_
;
// [intermediate_size * hidden_size ( /32 if quantized)]
void
*
down_proj_
;
// [hidden_size * intermediate_size ( /32 if quantized)]
std
::
vector
<
float
>
input_fp32_
;
// [hidden_size]
std
::
vector
<
uint8_t
>
gate_input_
;
// [
hidden_size * 4
]
std
::
vector
<
uint8_t
>
up_input_
;
// [
hidden_size * 4
]
std
::
vector
<
float
>
gate_output_
;
// [intermediate_size]
std
::
vector
<
float
>
up_output_
;
// [intermediate_size]
std
::
vector
<
float
>
intermediate_fp32_
;
// [intermediate_size]
std
::
vector
<
uint8_t
>
down_input_
;
// [intermediate_size *
4
]
std
::
vector
<
float
>
down_output_
;
// [hidden_size]
float
*
input_fp32_
;
// [
group_max_len *
hidden_size]
uint8_t
*
gate_input_
;
// [
group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)
]
uint8_t
*
up_input_
;
// [
group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)
]
float
*
gate_output_
;
// [
group_max_len *
intermediate_size]
float
*
up_output_
;
// [
group_max_len *
intermediate_size]
float
*
intermediate_fp32_
;
// [
group_max_len *
intermediate_size]
uint8_t
*
down_input_
;
// [
group_max_len *
intermediate_size *
ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)
]
float
*
down_output_
;
// [
group_max_len *
hidden_size]
};
#endif
\ No newline at end of file
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp
View file @
f2938031
/**
* @Description :
* @Description :
* @Author : chenht2022
* @Date : 2024-07-22 02:03:22
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:07
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "moe.h"
#include <iostream>
#include "unistd.h"
void
*
MOE
::
buffer_
=
nullptr
;
MOE
::
MOE
(
MOEConfig
config
)
{
config_
=
config
;
gate_proj_
=
config_
.
gate_proj
;
up_proj_
=
config_
.
up_proj
;
down_proj_
=
config_
.
down_proj
;
if
(
MOE
::
buffer_
==
nullptr
)
{
uint64_t
buffer_size
=
0
;
buffer_size
+=
sizeof
(
float
)
*
config_
.
group_max_len
*
config_
.
hidden_size
;
buffer_size
+=
config_
.
group_max_len
*
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
);
buffer_size
+=
config_
.
group_max_len
*
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
);
buffer_size
+=
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
);
buffer_size
+=
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
);
buffer_size
+=
sizeof
(
float
)
*
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
intermediate_size
;
buffer_size
+=
sizeof
(
float
)
*
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
intermediate_size
;
buffer_size
+=
sizeof
(
float
)
*
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
intermediate_size
;
buffer_size
+=
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
intermediate_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
buffer_size
+=
sizeof
(
float
)
*
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
hidden_size
;
buffer_size
+=
sizeof
(
float
)
*
config_
.
group_max_len
*
config_
.
hidden_size
;
buffer_
=
malloc
(
buffer_size
);
}
uint64_t
offset
=
0
;
s_input_fp32_
=
(
float
*
)(
buffer_
+
offset
);
offset
+=
sizeof
(
float
)
*
config_
.
hidden_size
;
s_gate_input_
=
(
uint8_t
*
)(
buffer_
+
offset
);
offset
+=
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
);
s_up_input_
=
(
uint8_t
*
)(
buffer_
+
offset
);
offset
+=
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
);
std
::
vector
<
std
::
pair
<
void
**
,
uint64_t
>>
s_mem_requests
;
s_mem_requests
.
push_back
({(
void
**
)
&
s_input_fp32_
,
sizeof
(
float
)
*
config_
.
hidden_size
});
s_mem_requests
.
push_back
({(
void
**
)
&
s_gate_input_
,
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
)});
s_mem_requests
.
push_back
({(
void
**
)
&
s_up_input_
,
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)});
s_gate_output_
.
resize
(
config_
.
routed_expert_num
);
s_up_output_
.
resize
(
config_
.
routed_expert_num
);
s_intermediate_fp32_
.
resize
(
config_
.
routed_expert_num
);
s_down_input_
.
resize
(
config_
.
routed_expert_num
);
s_down_output_
.
resize
(
config_
.
routed_expert_num
);
for
(
int
i
=
0
;
i
<
config_
.
routed_expert_num
;
i
++
)
{
s_gate_output_
[
i
]
=
(
float
*
)(
buffer_
+
offset
);
offset
+=
sizeof
(
float
)
*
config_
.
intermediate_size
;
s_up_output_
[
i
]
=
(
float
*
)(
buffer_
+
offset
);
offset
+=
sizeof
(
float
)
*
config_
.
intermediate_size
;
s_intermediate_fp32_
[
i
]
=
(
float
*
)(
buffer_
+
offset
);
offset
+=
sizeof
(
float
)
*
config_
.
intermediate_size
;
s_down_input_
[
i
]
=
(
uint8_t
*
)(
buffer_
+
offset
);
offset
+=
config_
.
intermediate_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
s_down_output_
[
i
]
=
(
float
*
)(
buffer_
+
offset
);
offset
+=
sizeof
(
float
)
*
config_
.
hidden_size
;
s_mem_requests
.
push_back
({(
void
**
)
&
s_gate_output_
[
i
],
sizeof
(
float
)
*
config_
.
intermediate_size
});
s_mem_requests
.
push_back
({(
void
**
)
&
s_up_output_
[
i
],
sizeof
(
float
)
*
config_
.
intermediate_size
});
s_mem_requests
.
push_back
({(
void
**
)
&
s_intermediate_fp32_
[
i
],
sizeof
(
float
)
*
config_
.
intermediate_size
});
s_mem_requests
.
push_back
({(
void
**
)
&
s_down_input_
[
i
],
config_
.
intermediate_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)});
s_mem_requests
.
push_back
({(
void
**
)
&
s_down_output_
[
i
],
sizeof
(
float
)
*
config_
.
hidden_size
});
}
s_output_fp32_
=
(
float
*
)(
buffer_
+
offset
);
s_mem_requests
.
push_back
({(
void
**
)
&
s_output_fp32_
,
sizeof
(
float
)
*
config_
.
hidden_size
});
shared_mem_buffer
.
alloc
(
this
,
s_mem_requests
);
offset
=
0
;
std
::
vector
<
std
::
pair
<
void
**
,
uint64_t
>>
m_mem_requests
;
m_input_fp32_
.
resize
(
config_
.
group_max_len
);
m_gate_input_
.
resize
(
config_
.
group_max_len
);
m_up_input_
.
resize
(
config_
.
group_max_len
);
for
(
int
i
=
0
;
i
<
config_
.
group_max_len
;
i
++
)
{
m_input_fp32_
[
i
]
=
(
float
*
)(
buffer_
+
offset
);
offset
+=
sizeof
(
float
)
*
config_
.
hidden_size
;
m_gate_input_
[
i
]
=
(
uint8_t
*
)(
buffer_
+
offset
);
offset
+=
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
);
m_up_input_
[
i
]
=
(
uint8_t
*
)(
buffer_
+
offset
);
offset
+=
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
);
m_mem_requests
.
push_back
({(
void
**
)
&
m_input_fp32_
[
i
],
sizeof
(
float
)
*
config_
.
hidden_size
});
m_mem_requests
.
push_back
({(
void
**
)
&
m_gate_input_
[
i
],
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
)});
m_mem_requests
.
push_back
({(
void
**
)
&
m_up_input_
[
i
],
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)});
}
m_local_gate_input_
=
(
uint8_t
*
)(
buffer_
+
offset
);
offset
+=
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
);
m_local_up_input_
=
(
uint8_t
*
)(
buffer_
+
offset
);
offset
+=
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
);
m_local_gate_output_
=
(
float
*
)(
buffer_
+
offset
);
offset
+=
sizeof
(
float
)
*
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
intermediate_size
;
m_local_up_output_
=
(
float
*
)(
buffer_
+
offset
);
offset
+=
sizeof
(
float
)
*
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
intermediate_size
;
m_local_intermediate_fp32_
=
(
float
*
)(
buffer_
+
offset
);
offset
+=
sizeof
(
float
)
*
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
intermediate_size
;
m_local_down_input_
=
(
uint8_t
*
)(
buffer_
+
offset
);
offset
+=
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
intermediate_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
m_local_down_output_
=
(
float
*
)(
buffer_
+
offset
);
offset
+=
sizeof
(
float
)
*
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
hidden_size
;
m_mem_requests
.
push_back
({(
void
**
)
&
m_local_gate_input_
,
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
)});
m_mem_requests
.
push_back
({(
void
**
)
&
m_local_up_input_
,
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
hidden_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)});
m_mem_requests
.
push_back
({(
void
**
)
&
m_local_gate_output_
,
sizeof
(
float
)
*
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
intermediate_size
});
m_mem_requests
.
push_back
({(
void
**
)
&
m_local_up_output_
,
sizeof
(
float
)
*
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
intermediate_size
});
m_mem_requests
.
push_back
({(
void
**
)
&
m_local_intermediate_fp32_
,
sizeof
(
float
)
*
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
intermediate_size
});
m_mem_requests
.
push_back
({(
void
**
)
&
m_local_down_input_
,
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
intermediate_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)});
m_mem_requests
.
push_back
({(
void
**
)
&
m_local_down_output_
,
sizeof
(
float
)
*
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
hidden_size
});
m_output_fp32_
.
resize
(
config_
.
group_max_len
);
for
(
int
i
=
0
;
i
<
config_
.
group_max_len
;
i
++
)
{
m_output_fp32_
[
i
]
=
(
float
*
)(
buffer_
+
offset
);
offset
+=
sizeof
(
float
)
*
config_
.
hidden_size
;
m_mem_requests
.
push_back
({(
void
**
)
&
m_output_fp32_
[
i
],
sizeof
(
float
)
*
config_
.
hidden_size
});
}
shared_mem_buffer
.
alloc
(
this
,
m_mem_requests
);
m_local_pos_
.
resize
(
config_
.
group_max_len
);
for
(
int
i
=
0
;
i
<
config_
.
group_max_len
;
i
++
)
{
...
...
@@ -107,6 +72,10 @@ MOE::MOE(MOEConfig config) {
m_local_down_output_ptr_
.
resize
(
config_
.
expert_num
);
}
MOE
::~
MOE
()
{
shared_mem_buffer
.
dealloc
(
this
);
}
void
MOE
::
warm_up
(
Backend
*
backend
)
{
std
::
vector
<
float
>
input_fp32
(
config_
.
hidden_size
);
std
::
vector
<
uint8_t
>
input
(
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
));
...
...
ktransformers/ktransformers_ext/operators/llamafile/moe.h
View file @
f2938031
...
...
@@ -3,7 +3,7 @@
* @Author : chenht2022
* @Date : 2024-07-22 02:03:22
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:35:10
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
...
...
@@ -22,6 +22,7 @@
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
#include "shared_mem_buffer.h"
struct
MOEConfig
{
int
expert_num
;
...
...
@@ -48,13 +49,13 @@ struct MOEConfig {
class
MOE
{
public:
MOE
(
MOEConfig
);
~
MOE
();
void
warm_up
(
Backend
*
backend
);
void
forward_one
(
int
k
,
const
uint64_t
*
expert_ids
,
const
float
*
weights
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
);
void
forward_many
(
int
qlen
,
int
k
,
const
uint64_t
*
expert_ids
,
const
float
*
weights
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
);
void
forward
(
int
qlen
,
int
k
,
const
uint64_t
*
expert_ids
,
const
float
*
weights
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
);
private:
static
void
*
buffer_
;
MOEConfig
config_
;
void
*
gate_proj_
;
// [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
void
*
up_proj_
;
// [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
...
...
ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.cpp
0 → 100644
View file @
f2938031
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-08-05 04:49:08
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-08-05 09:21:29
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "shared_mem_buffer.h"
#include <cstdio>
SharedMemBuffer
::
SharedMemBuffer
()
{
buffer_
=
nullptr
;
size_
=
0
;
}
SharedMemBuffer
::~
SharedMemBuffer
()
{
if
(
buffer_
)
{
free
(
buffer_
);
}
}
void
SharedMemBuffer
::
alloc
(
void
*
object
,
std
::
vector
<
std
::
pair
<
void
**
,
uint64_t
>>
requests
)
{
uint64_t
size
=
0
;
for
(
auto
&
request
:
requests
)
{
size
+=
request
.
second
;
}
if
(
size
>
size_
)
{
if
(
buffer_
)
{
free
(
buffer_
);
}
buffer_
=
malloc
(
size
);
size_
=
size
;
for
(
auto
&
obj_requests
:
hist_requests_
)
{
for
(
auto
&
requests
:
obj_requests
.
second
)
{
arrange
(
requests
);
}
}
}
arrange
(
requests
);
hist_requests_
[
object
].
push_back
(
requests
);
}
void
SharedMemBuffer
::
dealloc
(
void
*
object
)
{
hist_requests_
.
erase
(
object
);
}
void
SharedMemBuffer
::
arrange
(
std
::
vector
<
std
::
pair
<
void
**
,
uint64_t
>>
requests
)
{
uint64_t
offset
=
0
;
for
(
auto
&
request
:
requests
)
{
*
(
request
.
first
)
=
buffer_
+
offset
;
offset
+=
request
.
second
;
}
}
ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.h
0 → 100644
View file @
f2938031
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-08-05 04:49:08
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-08-05 06:36:41
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_SHAREDMEMBUFFER_H
#define CPUINFER_SHAREDMEMBUFFER_H
#include <cstdint>
#include <cstdlib>
#include <map>
#include <vector>
class
SharedMemBuffer
{
public:
SharedMemBuffer
();
~
SharedMemBuffer
();
void
alloc
(
void
*
object
,
std
::
vector
<
std
::
pair
<
void
**
,
uint64_t
>>
requests
);
void
dealloc
(
void
*
object
);
private:
void
*
buffer_
;
uint64_t
size_
;
std
::
map
<
void
*
,
std
::
vector
<
std
::
vector
<
std
::
pair
<
void
**
,
uint64_t
>>>>
hist_requests_
;
void
arrange
(
std
::
vector
<
std
::
pair
<
void
**
,
uint64_t
>>
requests
);
};
static
SharedMemBuffer
shared_mem_buffer
;
#endif
\ No newline at end of file
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment