OpenDAS / ktransformers / Commits

Commit f2938031 (unverified)
Authored Aug 09, 2024 by UnicornChan; committed by GitHub on Aug 09, 2024

Merge pull request #27 from chenht2022/develop-0.1.2

[Feature] towards 0.1.2

Parents: 442e13bc, c1cc7d2c
Changes: 21
Showing 20 changed files with 745 additions and 727 deletions (+745, -727)
ktransformers/ktransformers_ext/CMakeLists.txt                              +1   -2
ktransformers/ktransformers_ext/bench/bench_linear.py                       +31  -21
ktransformers/ktransformers_ext/bench/bench_linear_torch.py                 +21  -23
ktransformers/ktransformers_ext/bench/bench_mlp.py                          +31  -21
ktransformers/ktransformers_ext/bench/bench_mlp_torch.py                    +33  -53
ktransformers/ktransformers_ext/bench/bench_moe.py                          +35  -29
ktransformers/ktransformers_ext/bench/bench_moe_torch.py                    +68  -79
ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h                      +30   -6
ktransformers/ktransformers_ext/examples/test_linear.py                     +22  -43
ktransformers/ktransformers_ext/examples/test_mlp.py                        +35  -51
ktransformers/ktransformers_ext/examples/test_moe.py                        +73  -65
ktransformers/ktransformers_ext/ext_bindings.cpp                            +118 -203
ktransformers/ktransformers_ext/operators/llamafile/linear.cpp              +38  -12
ktransformers/ktransformers_ext/operators/llamafile/linear.h                +11   -7
ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp                 +55  -35
ktransformers/ktransformers_ext/operators/llamafile/mlp.h                   +16  -12
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp                 +32  -63
ktransformers/ktransformers_ext/operators/llamafile/moe.h                   +3   -2
ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.cpp   +55   -0
ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.h     +37   -0
ktransformers/ktransformers_ext/CMakeLists.txt

@@ -22,14 +22,13 @@ option(LLAMA_AVX2 "llama: enable AVX2"
 option(LLAMA_AVX512 "llama: enable AVX512" OFF)
 option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
 option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
-option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF)
 option(LLAMA_FMA "llama: enable FMA" OFF)
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
     option(LLAMA_F16C "llama: enable F16C" OFF)
 endif()
 option(LLAMA_AVX512_FANCY_SIMD "llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI" OFF)
+option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF)
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
ktransformers/ktransformers_ext/bench/bench_linear.py

@@ -6,7 +6,7 @@ Author : chenht2022
 Date : 2024-07-25 10:31:59
 Version : 1.0.0
 LastEditors : chenht2022
-LastEditTime : 2024-07-25 10:32:51
+LastEditTime : 2024-08-06 10:35:35
 Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 '''
 import os, sys

@@ -15,15 +15,18 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
 import cpuinfer_ext
 import torch

+input_size = 16384
+output_size = 5120
+stride = 16
+group_max_len = 1024
+layer_num = 10
+qlen = 1
+CPUInfer = cpuinfer_ext.CPUInfer(64)
+warm_up_iter = 1000
+test_iter = 10000
+
 def bench_linear(quant_mode: str):
     with torch.inference_mode(mode=True):
-        input_size = 16384
-        output_size = 5120
-        stride = 16
-        layer_num = 10
-        CPUInfer = cpuinfer_ext.CPUInfer(64)
-        warm_up_iter = 1000
-        test_iter = 10000
         hidden_type = 30 # ggml_type::GGML_TYPE_BF16
         if quant_mode == "fp32":

@@ -66,30 +69,37 @@ def bench_linear(quant_mode: str):
         projs = []
         for _ in range(layer_num):
             proj = torch.randn((output_size, input_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
-            config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, proj.data_ptr(), proj_type, hidden_type)
+            config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
             linear = cpuinfer_ext.linear.Linear(config)
             projs.append(proj)
             linears.append(linear)
+        input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
+        output = torch.empty((layer_num, qlen, output_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
         # warm up
         for i in range(warm_up_iter):
-            linear = linears[i % layer_num]
-            input = torch.randn((1, input_size), dtype=torch.bfloat16).contiguous()
-            output = torch.empty((1, output_size), dtype=torch.bfloat16).contiguous()
-            CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
+            CPUInfer.submit(
+                linears[i % layer_num].forward(
+                    qlen,
+                    input[i % layer_num].data_ptr(),
+                    output[i % layer_num].data_ptr()
+                )
+            )
             CPUInfer.sync()
         # test
-        total_time = 0
+        start = time.perf_counter()
         for i in range(test_iter):
-            linear = linears[i % layer_num]
-            input = torch.randn((1, input_size), dtype=torch.bfloat16).contiguous()
-            output = torch.empty((1, output_size), dtype=torch.bfloat16).contiguous()
-            start = time.perf_counter()
-            CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
+            CPUInfer.submit(
+                linears[i % layer_num].forward(
+                    qlen,
+                    input[i % layer_num].data_ptr(),
+                    output[i % layer_num].data_ptr()
+                )
+            )
             CPUInfer.sync()
-            end = time.perf_counter()
-            total_time += end - start
+        end = time.perf_counter()
+        total_time = end - start
         print('Quant mode: ', quant_mode)
         print('Time(s): ', total_time)
         print('Iteration: ', test_iter)
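The new calling convention above passes the object returned by linear.forward(qlen, in_ptr, out_ptr) to CPUInfer.submit instead of passing the bound method plus raw pointers. A minimal sketch of that flow, assuming the extension is built and importable as cpuinfer_ext; the sizes, thread count, and variable names are illustrative only:

import torch
import cpuinfer_ext

# Illustrative sizes mirroring the benchmark constants above.
input_size, output_size, stride, group_max_len, qlen = 16384, 5120, 16, 1024, 1
proj = torch.randn((output_size, input_size), dtype=torch.float32).contiguous()
# 0 = ggml_type::GGML_TYPE_F32 weights, 30 = ggml_type::GGML_TYPE_BF16 activations (per the comments in this commit).
config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len,
                                          proj.data_ptr(), 0, 30)
linear = cpuinfer_ext.linear.Linear(config)

cpu_infer = cpuinfer_ext.CPUInfer(64)          # worker-thread count, as in the benchmark
x = torch.randn((qlen, input_size), dtype=torch.bfloat16).contiguous()
y = torch.empty((qlen, output_size), dtype=torch.bfloat16).contiguous()

# forward(qlen, in_ptr, out_ptr) packages the call as a task; submit() enqueues it
# and sync() blocks until the CPU backend has filled y.
cpu_infer.submit(linear.forward(qlen, x.data_ptr(), y.data_ptr()))
cpu_infer.sync()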
ktransformers/ktransformers_ext/bench/bench_linear_torch.py

@@ -14,14 +14,17 @@ import time
 import torch
 import torch.nn.quantized as nnq

+scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset
+
+input_size = 16384
+output_size = 5120
+layer_num = 10
+qlen = 1
+warm_up_iter = 1000
+test_iter = 10000
+
 def bench_linear(quant_mode: str):
     with torch.inference_mode(mode=True):
-        input_size = 16384
-        output_size = 5120
-        layer_num = 10
-        warm_up_iter = 1000
-        test_iter = 10000
         if quant_mode == "fp32":
             proj_type = torch.float32
             bytes_per_elem = 4.000000

@@ -41,37 +44,32 @@ def bench_linear(quant_mode: str):
         for _ in range(layer_num):
             proj = torch.randn((output_size, input_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
             if quant_mode == "qint8":
-                scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset
                 proj_q = torch.quantize_per_tensor(proj, scale, zero_point, torch.qint8)
                 quantized_layer = nnq.Linear(input_size, output_size)
                 quantized_layer.set_weight_bias(proj_q, None)
                 projs.append(quantized_layer)
             else:
                 projs.append(proj.to(proj_type))
+        input = torch.randn((layer_num, qlen, input_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
         # warm up
         for i in range(warm_up_iter):
-            input = torch.randn((1, input_size), dtype=torch.float32).contiguous()
-            if isinstance(projs[i % layer_num], nnq.Linear):
-                input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
-                t_output = projs[i % layer_num](input_q)
+            if quant_mode == "qint8":
+                input_q = torch.quantize_per_tensor(input[i % layer_num].to(torch.float32), scale, zero_point, torch.quint8)
+                quantized_layer = projs[i % layer_num]
+                t_output = quantized_layer(input_q)
             else:
-                t_output = torch.mm(input.to(proj_type), projs[i % layer_num].t())
+                t_output = torch.mm(input[i % layer_num].to(proj_type), projs[i % layer_num].t())
         # test
-        total_time = 0
+        start = time.perf_counter()
         for i in range(test_iter):
-            input = torch.randn((1, input_size), dtype=torch.float32).contiguous()
-            if isinstance(projs[i % layer_num], nnq.Linear):
-                start = time.perf_counter()
-                input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
-                t_output = projs[i % layer_num](input_q)
+            if quant_mode == "qint8":
+                input_q = torch.quantize_per_tensor(input[i % layer_num].to(torch.float32), scale, zero_point, torch.quint8)
+                quantized_layer = projs[i % layer_num]
+                t_output = quantized_layer(input_q)
             else:
-                t_output = torch.mm(input.to(proj_type), projs[i % layer_num].t())
-            end = time.perf_counter()
-            total_time += end - start
+                t_output = torch.mm(input[i % layer_num].to(proj_type), projs[i % layer_num].t())
+        end = time.perf_counter()
+        total_time = end - start
         print('Quant mode: ', quant_mode)
         print('Time(s): ', total_time)
         print('Iteration: ', test_iter)
ktransformers/ktransformers_ext/bench/bench_mlp.py

@@ -6,7 +6,7 @@ Author : chenht2022
 Date : 2024-07-16 10:43:18
 Version : 1.0.0
 LastEditors : chenht2022
-LastEditTime : 2024-07-25 10:32:55
+LastEditTime : 2024-08-06 10:36:04
 Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 '''
 import os, sys

@@ -15,15 +15,18 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
 import cpuinfer_ext
 import torch

+hidden_size = 5120
+intermediate_size = 3072
+stride = 16
+group_max_len = 1024
+layer_num = 10
+qlen = 1
+CPUInfer = cpuinfer_ext.CPUInfer(64)
+warm_up_iter = 1000
+test_iter = 10000
+
 def bench_mlp(quant_mode: str):
     with torch.inference_mode(mode=True):
-        hidden_size = 5120
-        intermediate_size = 3072
-        stride = 16
-        layer_num = 10
-        CPUInfer = cpuinfer_ext.CPUInfer(64)
-        warm_up_iter = 1000
-        test_iter = 10000
         hidden_type = 30 # ggml_type::GGML_TYPE_BF16
         if quant_mode == "fp32":

@@ -93,32 +96,39 @@ def bench_mlp(quant_mode: str):
             gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
             up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
             down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
-            config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
+            config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
             mlp = cpuinfer_ext.mlp.MLP(config)
             gate_projs.append(gate_proj)
             up_projs.append(up_proj)
             down_projs.append(down_proj)
             mlps.append(mlp)
+        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
+        output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
         # warm up
         for i in range(warm_up_iter):
-            mlp = mlps[i % layer_num]
-            input = torch.randn((1, hidden_size), dtype=torch.bfloat16).contiguous()
-            output = torch.empty((1, hidden_size), dtype=torch.bfloat16).contiguous()
-            CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
+            CPUInfer.submit(
+                mlps[i % layer_num].forward(
+                    qlen,
+                    input[i % layer_num].data_ptr(),
+                    output[i % layer_num].data_ptr()
+                )
+            )
             CPUInfer.sync()
         # test
-        total_time = 0
+        start = time.perf_counter()
         for i in range(test_iter):
-            mlp = mlps[i % layer_num]
-            input = torch.randn((1, hidden_size), dtype=torch.bfloat16).contiguous()
-            output = torch.empty((1, hidden_size), dtype=torch.bfloat16).contiguous()
-            start = time.perf_counter()
-            CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
+            CPUInfer.submit(
+                mlps[i % layer_num].forward(
+                    qlen,
+                    input[i % layer_num].data_ptr(),
+                    output[i % layer_num].data_ptr()
+                )
+            )
             CPUInfer.sync()
-            end = time.perf_counter()
-            total_time += end - start
+        end = time.perf_counter()
+        total_time = end - start
         print('Quant mode: ', quant_mode)
         print('Time(s): ', total_time)
         print('Iteration: ', test_iter)
ktransformers/ktransformers_ext/bench/bench_mlp_torch.py

@@ -14,17 +14,38 @@ import time
 import torch
 import torch.nn.quantized as nnq

+scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset
+
+hidden_size = 5120
+intermediate_size = 3072
+layer_num = 10
+qlen = 1
+warm_up_iter = 1000
+test_iter = 10000
+
 def act_fn(x):
     return x / (1.0 + torch.exp(-x))

+def mlp_torch(input, gate_proj, up_proj, down_proj):
+    if isinstance(gate_proj, nnq.Linear):
+        input_q = torch.quantize_per_tensor(input.to(torch.float32), scale, zero_point, torch.quint8)
+        gate_buf = gate_proj(input_q)
+        up_buf = up_proj(input_q)
+        gate_buf = gate_buf.dequantize()
+        up_buf = up_buf.dequantize()
+        intermediate = act_fn(gate_buf) * up_buf
+        intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
+        expert_output = down_proj(intermediate_q)
+        ret = expert_output.dequantize()
+    else:
+        gate_buf = torch.mm(input.to(gate_proj.dtype), gate_proj.t())
+        up_buf = torch.mm(input.to(up_proj.dtype), up_proj.t())
+        intermediate = act_fn(gate_buf) * up_buf
+        ret = torch.mm(intermediate.to(down_proj.dtype), down_proj.t())
+    return ret
+
 def bench_mlp(quant_mode: str):
     with torch.inference_mode(mode=True):
-        hidden_size = 5120
-        intermediate_size = 3072
-        layer_num = 10
-        warm_up_iter = 1000
-        test_iter = 10000
         if quant_mode == "fp32":
             proj_type = torch.float32
             bytes_per_elem = 4.000000

@@ -48,7 +69,6 @@ def bench_mlp(quant_mode: str):
             up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
             down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
             if quant_mode == "qint8":
-                scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset
                 gate_proj_q = torch.quantize_per_tensor(gate_proj, scale, zero_point, torch.qint8)
                 quantized_gate = nnq.Linear(hidden_size, intermediate_size)
                 quantized_gate.set_weight_bias(gate_proj_q, None)

@@ -65,58 +85,18 @@ def bench_mlp(quant_mode: str):
                 gate_projs.append(gate_proj.to(proj_type))
                 up_projs.append(up_proj.to(proj_type))
                 down_projs.append(down_proj.to(proj_type))
+        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
         # warm up
         for i in range(warm_up_iter):
-            input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
-            if quant_mode == "qint8":
-                input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
-                quantized_gate = gate_projs[i % layer_num]
-                gate_buf = quantized_gate(input_q)
-                quantized_up = up_projs[i % layer_num]
-                up_buf = quantized_gate(input_q)
-                gate_buf = gate_buf.dequantize()
-                up_buf = up_buf.dequantize()
-                intermediate = act_fn(gate_buf) * up_buf
-                intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
-                quantized_down = down_projs[i % layer_num]
-                t_output = quantized_down(intermediate_q)
-            else:
-                gate_proj = gate_projs[i % layer_num]
-                up_proj = up_projs[i % layer_num]
-                down_proj = down_projs[i % layer_num]
-                gate_buf = torch.mm(input.to(proj_type), gate_proj.t())
-                up_buf = torch.mm(input.to(proj_type), up_proj.t())
-                intermediate = act_fn(gate_buf) * up_buf
-                t_output = torch.mm(intermediate.to(proj_type), down_proj.t())
+            mlp_torch(input[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])
         # test
-        total_time = 0
+        start = time.perf_counter()
         for i in range(test_iter):
-            input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
-            start = time.perf_counter()
-            if quant_mode == "qint8":
-                input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
-                quantized_gate = gate_projs[i % layer_num]
-                gate_buf = quantized_gate(input_q)
-                quantized_up = up_projs[i % layer_num]
-                up_buf = quantized_gate(input_q)
-                gate_buf = gate_buf.dequantize()
-                up_buf = up_buf.dequantize()
-                intermediate = act_fn(gate_buf) * up_buf
-                intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
-                quantized_down = down_projs[i % layer_num]
-                t_output = quantized_down(intermediate_q)
-            else:
-                gate_proj = gate_projs[i % layer_num]
-                up_proj = up_projs[i % layer_num]
-                down_proj = down_projs[i % layer_num]
-                gate_buf = torch.mm(input.to(proj_type), gate_proj.t())
-                up_buf = torch.mm(input.to(proj_type), up_proj.t())
-                intermediate = act_fn(gate_buf) * up_buf
-                t_output = torch.mm(intermediate.to(proj_type), down_proj.t())
-            end = time.perf_counter()
-            total_time += end - start
+            mlp_torch(input[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])
+        end = time.perf_counter()
+        total_time = end - start
         print('Quant mode: ', quant_mode)
         print('Time(s): ', total_time)
         print('Iteration: ', test_iter)
ktransformers/ktransformers_ext/bench/bench_moe.py

@@ -6,7 +6,7 @@ Author : chenht2022
 Date : 2024-07-25 10:32:05
 Version : 1.0.0
 LastEditors : chenht2022
-LastEditTime : 2024-07-25 10:33:00
+LastEditTime : 2024-08-06 10:41:28
 Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 '''
 import os, sys

@@ -15,21 +15,21 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
 import cpuinfer_ext
 import torch

+expert_num = 160
+hidden_size = 5120
+intermediate_size = 1536
+stride = 16
+group_min_len = 10
+group_max_len = 1024
+n_routed_experts = 6
+layer_num = 10
+qlen = 1
+CPUInfer = cpuinfer_ext.CPUInfer(64)
+warm_up_iter = 1000
+test_iter = 10000
+
 def bench_moe(quant_mode: str):
     with torch.inference_mode(mode=True):
-        expert_num = 10
-        hidden_size = 5120
-        intermediate_size = 1536
-        stride = 16
-        group_min_len = 10
-        group_max_len = 1024
-        n_routed_experts = 6
-        layer_num = 10
-        qlen = 1
-        CPUInfer = cpuinfer_ext.CPUInfer(64)
-        warm_up_iter = 1000
-        test_iter = 10000
         hidden_type = 30 # ggml_type::GGML_TYPE_BF16
         if quant_mode == "fp32":
             gate_type = 0 # ggml_type::GGML_TYPE_F32

@@ -104,32 +104,38 @@ def bench_moe(quant_mode: str):
             up_projs.append(up_proj)
             down_projs.append(down_proj)
             moes.append(moe)
-        expert_ids = torch.randint(0, expert_num, (layer_num, qlen, n_routed_experts), dtype=torch.int64, device="cuda").to("cpu").contiguous()
+        expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device="cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
         weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device="cuda").to("cpu").contiguous()
         input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
         output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
         # warm up
         for i in range(warm_up_iter):
-            CPUInfer.submit(moes[i % layer_num].forward,
-                            qlen,
-                            n_routed_experts,
-                            expert_ids[i % layer_num].data_ptr(),
-                            weights[i % layer_num].data_ptr(),
-                            input[i % layer_num].data_ptr(),
-                            output[i % layer_num].data_ptr())
+            CPUInfer.submit(
+                moes[i % layer_num].forward(
+                    qlen,
+                    n_routed_experts,
+                    expert_ids[i % layer_num].data_ptr(),
+                    weights[i % layer_num].data_ptr(),
+                    input[i % layer_num].data_ptr(),
+                    output[i % layer_num].data_ptr()
+                )
+            )
             CPUInfer.sync()
         # test
         start = time.perf_counter()
         for i in range(test_iter):
-            CPUInfer.submit(moes[i % layer_num].forward,
-                            qlen,
-                            n_routed_experts,
-                            expert_ids[i % layer_num].data_ptr(),
-                            weights[i % layer_num].data_ptr(),
-                            input[i % layer_num].data_ptr(),
-                            output[i % layer_num].data_ptr())
+            CPUInfer.submit(
+                moes[i % layer_num].forward(
+                    qlen,
+                    n_routed_experts,
+                    expert_ids[i % layer_num].data_ptr(),
+                    weights[i % layer_num].data_ptr(),
+                    input[i % layer_num].data_ptr(),
+                    output[i % layer_num].data_ptr()
+                )
+            )
             CPUInfer.sync()
         end = time.perf_counter()
         total_time = end - start
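The expert-id generation above switches from torch.randint to nested torch.randperm slices, which (presumably the intent) guarantees n_routed_experts distinct expert ids per token instead of allowing repeats. A small self-contained sketch of that expression, with illustrative sizes and without the CUDA round-trip used in the benchmark:

import torch

expert_num, n_routed_experts, qlen, layer_num = 160, 6, 1, 10

# randint may repeat an expert id within a token's routed set;
# randperm(expert_num)[:k] draws k distinct experts per token.
expert_ids = torch.stack([
    torch.stack([torch.randperm(expert_num, dtype=torch.int64)[:n_routed_experts]
                 for _ in range(qlen)])
    for _ in range(layer_num)
]).contiguous()
print(expert_ids.shape)  # torch.Size([10, 1, 6])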
ktransformers/ktransformers_ext/bench/bench_moe_torch.py

@@ -14,19 +14,71 @@ import time
 import torch
 import torch.nn.quantized as nnq

+scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset
+
+expert_num = 160
+hidden_size = 5120
+intermediate_size = 1536
+n_routed_experts = 6
+layer_num = 10
+qlen = 1
+warm_up_iter = 1000
+test_iter = 10000
+
 def act_fn(x):
     return x / (1.0 + torch.exp(-x))

+def mlp_torch(input, gate_proj, up_proj, down_proj):
+    if isinstance(gate_proj, nnq.Linear):
+        input_q = torch.quantize_per_tensor(input.to(torch.float32), scale, zero_point, torch.quint8)
+        gate_buf = gate_proj(input_q)
+        up_buf = up_proj(input_q)
+        gate_buf = gate_buf.dequantize()
+        up_buf = up_buf.dequantize()
+        intermediate = act_fn(gate_buf) * up_buf
+        intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
+        expert_output = down_proj(intermediate_q)
+        ret = expert_output.dequantize()
+    else:
+        gate_buf = torch.mm(input.to(gate_proj.dtype), gate_proj.t())
+        up_buf = torch.mm(input.to(up_proj.dtype), up_proj.t())
+        intermediate = act_fn(gate_buf) * up_buf
+        ret = torch.mm(intermediate.to(down_proj.dtype), down_proj.t())
+    return ret
+
+def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
+    cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
+    cnts.scatter_(1, expert_ids, 1)
+    tokens_per_expert = cnts.sum(dim=0)
+    idxs = expert_ids.view(-1).argsort()
+    sorted_tokens = input[idxs // expert_ids.shape[1]]
+
+    outputs = []
+    start_idx = 0
+    for i, num_tokens in enumerate(tokens_per_expert):
+        end_idx = start_idx + num_tokens
+        if num_tokens == 0:
+            continue
+        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
+        expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
+        outputs.append(expert_out)
+        start_idx = end_idx
+
+    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
+
+    new_x = torch.empty_like(outs)
+    new_x[idxs] = outs
+    t_output = (
+        new_x.view(*expert_ids.shape, -1)
+        .type(weights.dtype)
+        .mul_(weights.unsqueeze(dim=-1))
+        .sum(dim=1)
+        .type(new_x.dtype)
+    )
+    return t_output
+
 def bench_moe(quant_mode: str):
     with torch.inference_mode(mode=True):
-        expert_num = 10
-        hidden_size = 5120
-        intermediate_size = 1536
-        n_routed_experts = 6
-        layer_num = 10
-        warm_up_iter = 1000
-        test_iter = 10000
         if quant_mode == "fp32":
             proj_type = torch.float32
             bytes_per_elem = 4.000000

@@ -50,7 +102,6 @@ def bench_moe(quant_mode: str):
             up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
             down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
             if quant_mode == "qint8":
-                scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset
                 quantized_gate_proj = []
                 quantized_up_proj = []
                 quantized_down_proj = []

@@ -74,82 +125,20 @@ def bench_moe(quant_mode: str):
                 gate_projs.append(gate_proj.to(proj_type))
                 up_projs.append(up_proj.to(proj_type))
                 down_projs.append(down_proj.to(proj_type))
+        expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device="cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
+        weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device="cuda").to("cpu").contiguous()
+        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
         # warm up
         for i in range(warm_up_iter):
-            expert_ids = torch.randint(0, expert_num, (n_routed_experts,), dtype=torch.int64).contiguous()
-            weights = torch.rand((n_routed_experts,), dtype=torch.float32).contiguous()
-            input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
-            if quant_mode == "qint8":
-                input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
-                t_output = torch.zeros((1, hidden_size), dtype=torch.float32).contiguous()
-                gate_proj = gate_projs[i % layer_num]
-                up_proj = up_projs[i % layer_num]
-                down_proj = down_projs[i % layer_num]
-                for i, expert_id in enumerate(expert_ids):
-                    quantized_gate = gate_proj[expert_id]
-                    gate_buf = quantized_gate(input_q)
-                    quantized_up = up_proj[expert_id]
-                    up_buf = quantized_up(input_q)
-                    gate_buf = gate_buf.dequantize()
-                    up_buf = up_buf.dequantize()
-                    intermediate = act_fn(gate_buf) * up_buf
-                    intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
-                    quantized_down = down_proj[expert_id]
-                    expert_output = quantized_down(intermediate_q)
-                    expert_output = expert_output.dequantize()
-                    t_output += weights[i] * expert_output
-            else:
-                t_output = torch.zeros((1, hidden_size), dtype=proj_type).contiguous()
-                gate_proj = gate_projs[i % layer_num]
-                up_proj = up_projs[i % layer_num]
-                down_proj = down_projs[i % layer_num]
-                for i, expert_id in enumerate(expert_ids):
-                    gate_buf = torch.mm(input.to(proj_type), gate_proj[expert_id].t())
-                    up_buf = torch.mm(input.to(proj_type), up_proj[expert_id].t())
-                    intermediate = act_fn(gate_buf) * up_buf
-                    expert_output = torch.mm(intermediate.to(proj_type), down_proj[expert_id].t())
-                    t_output += weights[i] * expert_output
+            moe_torch(input[i % layer_num], expert_ids[i % layer_num], weights[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])
         # test
-        total_time = 0
+        start = time.perf_counter()
         for i in range(test_iter):
-            expert_ids = torch.randint(0, expert_num, (n_routed_experts,), dtype=torch.int64).contiguous()
-            weights = torch.rand((n_routed_experts,), dtype=torch.float32).contiguous()
-            input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
-            start = time.perf_counter()
-            if quant_mode == "qint8":
-                input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
-                t_output = torch.zeros((1, hidden_size), dtype=torch.float32).contiguous()
-                gate_proj = gate_projs[i % layer_num]
-                up_proj = up_projs[i % layer_num]
-                down_proj = down_projs[i % layer_num]
-                for i, expert_id in enumerate(expert_ids):
-                    quantized_gate = gate_proj[expert_id]
-                    gate_buf = quantized_gate(input_q)
-                    quantized_up = up_proj[expert_id]
-                    up_buf = quantized_up(input_q)
-                    gate_buf = gate_buf.dequantize()
-                    up_buf = up_buf.dequantize()
-                    intermediate = act_fn(gate_buf) * up_buf
-                    intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
-                    quantized_down = down_proj[expert_id]
-                    expert_output = quantized_down(intermediate_q)
-                    expert_output = expert_output.dequantize()
-                    t_output += weights[i] * expert_output
-            else:
-                t_output = torch.zeros((1, hidden_size), dtype=proj_type).contiguous()
-                gate_proj = gate_projs[i % layer_num]
-                up_proj = up_projs[i % layer_num]
-                down_proj = down_projs[i % layer_num]
-                for i, expert_id in enumerate(expert_ids):
-                    gate_buf = torch.mm(input.to(proj_type), gate_proj[expert_id].t())
-                    up_buf = torch.mm(input.to(proj_type), up_proj[expert_id].t())
-                    intermediate = act_fn(gate_buf) * up_buf
-                    expert_output = torch.mm(intermediate.to(proj_type), down_proj[expert_id].t())
-                    t_output += weights[i] * expert_output
-            end = time.perf_counter()
-            total_time += end - start
+            moe_torch(input[i % layer_num], expert_ids[i % layer_num], weights[i % layer_num], gate_projs[i % layer_num], up_projs[i % layer_num], down_projs[i % layer_num])
+        end = time.perf_counter()
+        total_time = end - start
         print('Quant mode: ', quant_mode)
         print('Time(s): ', total_time)
         print('Iteration: ', test_iter)
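The new moe_torch reference replaces the per-expert Python loop with grouped dispatch: it counts tokens per expert, sorts the flattened routing slots so each expert sees its tokens as one contiguous batch, and scatters the results back. A tiny self-contained illustration of just that bookkeeping, with made-up sizes:

import torch

# Illustrative routing: 2 tokens, top-2 routing over 4 experts.
expert_num = 4
expert_ids = torch.tensor([[2, 0],   # token 0 routed to experts 2 and 0
                           [0, 3]])  # token 1 routed to experts 0 and 3
cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
cnts.scatter_(1, expert_ids, 1)
tokens_per_expert = cnts.sum(dim=0)          # tensor([2, 0, 1, 1])
idxs = expert_ids.view(-1).argsort()         # slot indices grouped by expert id
token_of_slot = idxs // expert_ids.shape[1]  # which token each grouped slot came from
print(tokens_per_expert, token_of_slot)      # expert 0 gets tokens 0 and 1, expert 2 gets token 0, expert 3 gets token 1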
ktransformers/ktransformers_ext/cpu_backend/cpuinfer.h

 /**
  * @Description  :
  * @Author       : chenht2022
  * @Date         : 2024-07-16 10:43:18
  * @Version      : 1.0.0
  * @LastEditors  : chenht2022
- * @LastEditTime : 2024-07-25 10:33:42
+ * @LastEditTime : 2024-08-07 09:47:43
  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
  **/
 #ifndef CPUINFER_CPUINFER_H
 #define CPUINFER_CPUINFER_H

@@ -17,6 +17,7 @@
 #include <queue>
 #include <thread>
 #include <vector>
+#include "cuda_runtime.h"

 #include "backend.h"
 #include "task_queue.h"

@@ -39,16 +40,39 @@ class CPUInfer {
     }

     template <typename Func, typename Obj, typename... Args>
-    void submit(Func f, Obj* obj, Args... args) {
+    void enqueue(Func f, Obj* obj, Args... args) {
         task_queue_->enqueue([=]() {
             std::invoke(f, *obj, args..., backend_);
         });
     }

+    void submit(std::pair<intptr_t, intptr_t> params) {
+        void (*func)(void*) = (void (*)(void*))params.first;
+        void* args = (void*)params.second;
+        *((CPUInfer**)args) = this;
+        func(args);
+    }
+
     void sync() {
         task_queue_->sync();
     }

+    void submit_with_cuda_stream(intptr_t user_cuda_stream, std::pair<intptr_t, intptr_t> params) {
+        void (*func)(void*) = (void (*)(void*))params.first;
+        void* args = (void*)params.second;
+        *((CPUInfer**)args) = this;
+        cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)func, args);
+    }
+
+    static void sync_(void* cpu_infer_ptr) {
+        CPUInfer* cpuinfer = (CPUInfer*)cpu_infer_ptr;
+        cpuinfer->sync();
+    }
+
+    void sync_with_cuda_stream(intptr_t user_cuda_stream) {
+        cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)&sync_, (void*)this);
+    }
+
    public:
     Backend* backend_;
     TaskQueue* task_queue_;
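The header now treats a submitted task as a (function pointer, argument pointer) pair and adds stream-ordered variants that launch the task or a sync as a CUDA host function on a user-supplied stream. A hedged Python sketch of how this could be driven from the benchmark side, assuming the pybind11 layer exposes the methods under these same names (the ext_bindings.cpp hunk below is truncated, so treat the Python-facing names as an assumption); sizes and type codes follow the F16 examples above:

import torch
import cpuinfer_ext

input_size, output_size, stride, group_max_len, qlen = 16384, 5120, 32, 1024, 30
proj = torch.randn((output_size, input_size), dtype=torch.float16).contiguous()
# 1 = ggml_type::GGML_TYPE_F16 for both the weight and the activations, as in test_linear.py.
config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len,
                                          proj.data_ptr(), 1, 1)
linear = cpuinfer_ext.linear.Linear(config)

cpu_infer = cpuinfer_ext.CPUInfer(48)
x = torch.randn((qlen, input_size), dtype=torch.float16).contiguous()
y = torch.empty((qlen, output_size), dtype=torch.float16).contiguous()

stream_ptr = torch.cuda.current_stream().cuda_stream   # raw cudaStream_t as a Python int
task = linear.forward(qlen, x.data_ptr(), y.data_ptr())

# Stream-ordered variant (assumed binding names): the CPU task runs as a host
# function when the stream reaches this point, and the second call makes the
# stream wait for the CPU side to finish before later GPU work proceeds.
cpu_infer.submit_with_cuda_stream(stream_ptr, task)
cpu_infer.sync_with_cuda_stream(stream_ptr)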
ktransformers/ktransformers_ext/examples/test_linear.py

@@ -6,7 +6,7 @@ Author : chenht2022
 Date : 2024-07-25 10:32:05
 Version : 1.0.0
 LastEditors : chenht2022
-LastEditTime : 2024-07-25 10:34:00
+LastEditTime : 2024-08-06 10:36:59
 Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 '''
 import os, sys

@@ -15,23 +15,23 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
 import cpuinfer_ext
 import torch

-with torch.inference_mode(mode=True):
-    input_size = 16384
-    output_size = 5120
-    stride = 32
-    proj_type = 1 # ggml_type::GGML_TYPE_F16
-    hidden_type = 1 # ggml_type::GGML_TYPE_F16
-    layer_num = 10
-    CPUInfer = cpuinfer_ext.CPUInfer(48)
-    validation_iter = 100
-    warm_up_iter = 1000
-    test_iter = 10000
+input_size = 16384
+output_size = 5120
+stride = 32
+group_max_len = 1024
+proj_type = 1 # ggml_type::GGML_TYPE_F16
+hidden_type = 1 # ggml_type::GGML_TYPE_F16
+qlen = 30
+layer_num = 10
+CPUInfer = cpuinfer_ext.CPUInfer(48)
+validation_iter = 100
+
+with torch.inference_mode(mode=True):
     linears = []
     projs = []
     for _ in range(layer_num):
         proj = torch.randn((output_size, input_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
-        config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, proj.data_ptr(), proj_type, hidden_type)
+        config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, group_max_len, proj.data_ptr(), proj_type, hidden_type)
         linear = cpuinfer_ext.linear.Linear(config)
         projs.append(proj)
         linears.append(linear)

@@ -39,11 +39,17 @@ with torch.inference_mode(mode=True):
     # validation
     for i in range(validation_iter):
         linear = linears[i % layer_num]
-        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
-        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
+        input = torch.randn((qlen, input_size), dtype=torch.float16).contiguous()
+        output = torch.empty((qlen, output_size), dtype=torch.float16).contiguous()
         input = input / 100
-        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
+        CPUInfer.submit(
+            linear.forward(
+                qlen,
+                input.data_ptr(),
+                output.data_ptr()
+            )
+        )
         CPUInfer.sync()
         # print('cpuinfer output', output)

@@ -54,30 +60,3 @@ with torch.inference_mode(mode=True):
         diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
         print('diff = ', diff)
         assert(diff < 0.001)
-
-    # warm up
-    for i in range(warm_up_iter):
-        linear = linears[i % layer_num]
-        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
-        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
-        input = input / 100
-        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
-        CPUInfer.sync()
-
-    # test
-    total_time = 0
-    for i in range(test_iter):
-        linear = linears[i % layer_num]
-        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
-        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
-        input = input / 100
-        start = time.perf_counter()
-        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
-        CPUInfer.sync()
-        end = time.perf_counter()
-        total_time += end - start
-    print('Time: ', total_time)
-    print('Iteration: ', test_iter)
-    print('Time per iteration: ', total_time / test_iter)
-    print('Bandwidth: ', input_size * output_size * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
 print("All tasks completed.")
\ No newline at end of file
ktransformers/ktransformers_ext/examples/test_mlp.py

@@ -6,7 +6,7 @@ Author : chenht2022
 Date : 2024-07-25 10:32:05
 Version : 1.0.0
 LastEditors : chenht2022
-LastEditTime : 2024-07-25 10:34:03
+LastEditTime : 2024-08-06 10:37:28
 Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 '''
 import os, sys

@@ -15,20 +15,30 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
 import cpuinfer_ext
 import torch

-with torch.inference_mode(mode=True):
-    hidden_size = 5120
-    intermediate_size = 3072
-    stride = 32
-    gate_type = 1 # ggml_type::GGML_TYPE_F16
-    up_type = 1 # ggml_type::GGML_TYPE_F16
-    down_type = 1 # ggml_type::GGML_TYPE_F16
-    hidden_type = 1 # ggml_type::GGML_TYPE_F16
-    layer_num = 10
-    CPUInfer = cpuinfer_ext.CPUInfer(48)
-    validation_iter = 100
-    warm_up_iter = 1000
-    test_iter = 10000
+hidden_size = 5120
+intermediate_size = 3072
+stride = 32
+group_max_len = 1024
+gate_type = 1 # ggml_type::GGML_TYPE_F16
+up_type = 1 # ggml_type::GGML_TYPE_F16
+down_type = 1 # ggml_type::GGML_TYPE_F16
+hidden_type = 1 # ggml_type::GGML_TYPE_F16
+qlen = 30
+layer_num = 10
+CPUInfer = cpuinfer_ext.CPUInfer(48)
+validation_iter = 100
+
+def act_fn(x):
+    return x / (1.0 + torch.exp(-x))
+
+def mlp_torch(input, gate_proj, up_proj, down_proj):
+    gate_buf = torch.mm(input, gate_proj.t())
+    up_buf = torch.mm(input, up_proj.t())
+    intermediate = act_fn(gate_buf) * up_buf
+    ret = torch.mm(intermediate, down_proj.t())
+    return ret
+
+with torch.inference_mode(mode=True):
     mlps = []
     gate_projs = []
     up_projs = []

@@ -37,7 +47,7 @@ with torch.inference_mode(mode=True):
         gate_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
         up_proj = torch.randn((intermediate_size, hidden_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
         down_proj = torch.randn((hidden_size, intermediate_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
-        config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
+        config = cpuinfer_ext.mlp.MLPConfig(hidden_size, intermediate_size, stride, group_max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr(), gate_type, up_type, down_type, hidden_type)
         mlp = cpuinfer_ext.mlp.MLP(config)
         gate_projs.append(gate_proj)
         up_projs.append(up_proj)

@@ -47,52 +57,26 @@ with torch.inference_mode(mode=True):
     # validation
     for i in range(validation_iter):
         mlp = mlps[i % layer_num]
-        input = torch.randn((1, hidden_size), dtype=torch.float16).contiguous()
-        output = torch.empty((1, hidden_size), dtype=torch.float16).contiguous()
+        input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
+        output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
         input = input / 100
-        CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
+        CPUInfer.submit(
+            mlp.forward(
+                qlen,
+                input.data_ptr(),
+                output.data_ptr()
+            )
+        )
         CPUInfer.sync()
         # print('cpuinfer output', output)

-        def act_fn(x):
-            return x / (1.0 + torch.exp(-x))
         gate_proj = gate_projs[i % layer_num]
         up_proj = up_projs[i % layer_num]
         down_proj = down_projs[i % layer_num]
-        gate_buf = torch.mm(input, gate_proj.t())
-        up_buf = torch.mm(input, up_proj.t())
-        intermediate = act_fn(gate_buf) * up_buf
-        t_output = torch.mm(intermediate, down_proj.t())
+        t_output = mlp_torch(input, gate_proj, up_proj, down_proj)
         # print('torch output', t_output)

         diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
         print('diff = ', diff)
         assert(diff < 0.001)
-
-    # warm up
-    for i in range(warm_up_iter):
-        mlp = mlps[i % layer_num]
-        input = torch.randn((1, hidden_size), dtype=torch.float16).contiguous()
-        output = torch.empty((1, hidden_size), dtype=torch.float16).contiguous()
-        input = input / 100
-        CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
-        CPUInfer.sync()
-
-    # test
-    total_time = 0
-    for i in range(test_iter):
-        mlp = mlps[i % layer_num]
-        input = torch.randn((1, hidden_size), dtype=torch.float16).contiguous()
-        output = torch.empty((1, hidden_size), dtype=torch.float16).contiguous()
-        input = input / 100
-        start = time.time()
-        CPUInfer.submit(mlp.forward, input.data_ptr(), output.data_ptr())
-        CPUInfer.sync()
-        end = time.time()
-        total_time += end - start
-    print('Time: ', total_time)
-    print('Iteration: ', test_iter)
-    print('Time per iteration: ', total_time / test_iter)
-    print('Bandwidth: ', hidden_size * intermediate_size * 3 * 2 * test_iter / total_time / 1024 / 1024 / 1024, 'GB/s')
 print("All tasks completed.")
\ No newline at end of file
ktransformers/ktransformers_ext/examples/test_moe.py
View file @
f2938031
...
@@ -6,7 +6,7 @@ Author : chenht2022
...
@@ -6,7 +6,7 @@ Author : chenht2022
Date : 2024-07-25 10:32:05
Date : 2024-07-25 10:32:05
Version : 1.0.0
Version : 1.0.0
LastEditors : chenht2022
LastEditors : chenht2022
LastEditTime : 2024-0
7-25
10:3
4
:0
6
LastEditTime : 2024-0
8-06
10:3
8
:0
5
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
'''
import
os
,
sys
import
os
,
sys
...
@@ -15,25 +15,64 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
...
@@ -15,25 +15,64 @@ sys.path.append(os.path.dirname(__file__) + '/../build')
import
cpuinfer_ext
import
cpuinfer_ext
import
torch
import
torch
with
torch
.
inference_mode
(
mode
=
True
):
expert_num
=
160
expert_num
=
10
hidden_size
=
5120
hidden_size
=
5120
intermediate_size
=
1536
intermediate_size
=
1536
stride
=
32
stride
=
32
group_min_len
=
10
group_min_len
=
10
group_max_len
=
1024
group_max_len
=
1024
gate_type
=
1
# ggml_type::GGML_TYPE_F16
gate_type
=
1
# ggml_type::GGML_TYPE_F16
up_type
=
1
# ggml_type::GGML_TYPE_F16
up_type
=
1
# ggml_type::GGML_TYPE_F16
down_type
=
1
# ggml_type::GGML_TYPE_F16
down_type
=
1
# ggml_type::GGML_TYPE_F16
hidden_type
=
1
# ggml_type::GGML_TYPE_F16
hidden_type
=
1
# ggml_type::GGML_TYPE_F16
n_routed_experts
=
6
n_routed_experts
=
6
qlen
=
30
qlen
=
30
layer_num
=
10
layer_num
=
10
CPUInfer
=
cpuinfer_ext
.
CPUInfer
(
48
)
CPUInfer
=
cpuinfer_ext
.
CPUInfer
(
48
)
validation_iter
=
100
validation_iter
=
100
warm_up_iter
=
1000
def
act_fn
(
x
):
test_iter
=
10000
return
x
/
(
1.0
+
torch
.
exp
(
-
x
))
def
mlp_torch
(
input
,
gate_proj
,
up_proj
,
down_proj
):
gate_buf
=
torch
.
mm
(
input
,
gate_proj
.
t
())
up_buf
=
torch
.
mm
(
input
,
up_proj
.
t
())
intermediate
=
act_fn
(
gate_buf
)
*
up_buf
ret
=
torch
.
mm
(
intermediate
,
down_proj
.
t
())
return
ret
def
moe_torch
(
input
,
expert_ids
,
weights
,
gate_proj
,
up_proj
,
down_proj
):
cnts
=
expert_ids
.
new_zeros
((
expert_ids
.
shape
[
0
],
expert_num
))
cnts
.
scatter_
(
1
,
expert_ids
,
1
)
tokens_per_expert
=
cnts
.
sum
(
dim
=
0
)
idxs
=
expert_ids
.
view
(
-
1
).
argsort
()
sorted_tokens
=
input
[
idxs
//
expert_ids
.
shape
[
1
]]
outputs
=
[]
start_idx
=
0
for
i
,
num_tokens
in
enumerate
(
tokens_per_expert
):
end_idx
=
start_idx
+
num_tokens
if
num_tokens
==
0
:
continue
tokens_for_this_expert
=
sorted_tokens
[
start_idx
:
end_idx
]
expert_out
=
mlp_torch
(
tokens_for_this_expert
,
gate_proj
[
i
],
up_proj
[
i
],
down_proj
[
i
])
outputs
.
append
(
expert_out
)
start_idx
=
end_idx
outs
=
torch
.
cat
(
outputs
,
dim
=
0
)
if
len
(
outputs
)
else
sorted_tokens
.
new_empty
(
0
)
new_x
=
torch
.
empty_like
(
outs
)
new_x
[
idxs
]
=
outs
t_output
=
(
new_x
.
view
(
*
expert_ids
.
shape
,
-
1
)
.
type
(
weights
.
dtype
)
.
mul_
(
weights
.
unsqueeze
(
dim
=-
1
))
.
sum
(
dim
=
1
)
.
type
(
new_x
.
dtype
)
)
return
t_output
with
torch
.
inference_mode
(
mode
=
True
):
moes
=
[]
moes
=
[]
gate_projs
=
[]
gate_projs
=
[]
up_projs
=
[]
up_projs
=
[]
...
@@ -51,63 +90,32 @@ with torch.inference_mode(mode=True):

     # validation
     for i in range(validation_iter):
-        moe = moes[i % layer_num]
-        expert_ids = torch.stack([torch.randperm(expert_num)[:n_routed_experts] for _ in range(qlen)]).contiguous()
+        expert_ids = torch.randint(0, expert_num, (qlen, n_routed_experts), dtype=torch.int64).contiguous()
         weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()
-        input = torch.randn((qlen, 1, hidden_size), dtype=torch.float16).contiguous()
-        output = torch.empty((qlen, 1, hidden_size), dtype=torch.float16).contiguous()
+        input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
+        output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
         input = input / 100
-        CPUInfer.submit(moe.forward, qlen, n_routed_experts, expert_ids.data_ptr(), weights.data_ptr(), input.data_ptr(), output.data_ptr())
+        moe = moes[i % layer_num]
+        CPUInfer.submit(moe.forward(qlen, n_routed_experts, expert_ids.data_ptr(), weights.data_ptr(), input.data_ptr(), output.data_ptr()))
         CPUInfer.sync()
         # print('cpuinfer output', output)
-        def act_fn(x):
-            return x / (1.0 + torch.exp(-x))
-        t_output = torch.zeros((qlen, 1, hidden_size), dtype=torch.float32).contiguous()
         gate_proj = gate_projs[i % layer_num]
         up_proj = up_projs[i % layer_num]
         down_proj = down_projs[i % layer_num]
-        for token_idx in range(qlen):
-            for i, expert_id in enumerate(expert_ids[token_idx]):
-                gate_buf = torch.mm(input[token_idx], gate_proj[expert_id].t())
-                up_buf = torch.mm(input[token_idx], up_proj[expert_id].t())
-                intermediate = act_fn(gate_buf) * up_buf
-                expert_output = torch.mm(intermediate, down_proj[expert_id].t())
-                t_output[token_idx] += weights[token_idx][i] * expert_output
+        t_output = moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj)
         # print('torch output', t_output)
         diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
         print('diff = ', diff)
         assert(diff < 0.001)

-    # warm up
-    for i in range(warm_up_iter):
-        moe = moes[i % layer_num]
-        expert_ids = torch.randint(0, expert_num, (qlen, n_routed_experts), dtype=torch.int64).contiguous()
-        weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()
-        input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
-        output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
-        input = input / 100
-        CPUInfer.submit(moe.forward, qlen, n_routed_experts, expert_ids.data_ptr(), weights.data_ptr(), input.data_ptr(), output.data_ptr())
-        CPUInfer.sync()
-
-    # test
-    total_time = 0
-    for i in range(test_iter):
-        moe = moes[i % layer_num]
-        expert_ids = torch.randint(0, expert_num, (qlen, n_routed_experts), dtype=torch.int64).contiguous()
-        weights = torch.rand((qlen, n_routed_experts), dtype=torch.float32).contiguous()
-        input = torch.randn((qlen, hidden_size), dtype=torch.float16).contiguous()
-        output = torch.empty((qlen, hidden_size), dtype=torch.float16).contiguous()
-        input = input / 100
-        start = time.perf_counter()
-        CPUInfer.submit(moe.forward, qlen, n_routed_experts, expert_ids.data_ptr(), weights.data_ptr(), input.data_ptr(), output.data_ptr())
-        CPUInfer.sync()
-        end = time.perf_counter()
-        total_time += end - start
-    print('Time: ', total_time)
-    print('Iteration: ', test_iter)
-    print('Time per iteration: ', total_time / test_iter)
-    print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
+    print("All tasks completed.")
\ No newline at end of file
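A note on the call convention visible in the hunk above: before this commit the Python side handed CPUInfer.submit the bound method plus its raw arguments; after it, the method is called directly and returns a packed task that submit only enqueues. A minimal sketch of the two forms, using the same objects set up in the example script (only the comment text is mine):

    # Before: pass the bound method and its arguments to submit
    # CPUInfer.submit(moe.forward, qlen, n_routed_experts,
    #                 expert_ids.data_ptr(), weights.data_ptr(),
    #                 input.data_ptr(), output.data_ptr())

    # After: moe.forward(...) packs the work item on the C++ side;
    # submit enqueues it and sync() waits for completion.
    CPUInfer.submit(moe.forward(qlen, n_routed_experts,
                                expert_ids.data_ptr(), weights.data_ptr(),
                                input.data_ptr(), output.data_ptr()))
    CPUInfer.sync()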
ktransformers/ktransformers_ext/ext_bindings.cpp
View file @ f2938031
...
@@ -3,8 +3,8 @@
  * @Author       : chenht2022
  * @Date         : 2024-07-22 02:03:22
  * @Version      : 1.0.0
  * @LastEditors  : chenht2022
- * @LastEditTime : 2024-07-25 10:34:23
+ * @LastEditTime : 2024-08-07 10:39:37
  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
 // Python bindings
...
@@ -12,7 +12,6 @@
 #include <iostream>
 #include <memory>
 #include "cpu_backend/cpuinfer.h"
-#include "cuda_runtime.h"
 #include "device_launch_parameters.h"
 #include "llamafile/flags.h"
 #include "operators/llamafile/linear.h"
...
@@ -26,239 +25,155 @@
 namespace py = pybind11;
 using namespace pybind11::literals;

-// Binding functions for the Linear class
 class LinearBindings {
    public:
-    static void bind_forward(CPUInfer& cpuinfer, Linear* linear, py::args args, py::kwargs kwargs) {
-        auto input = args[0].cast<intptr_t>();
-        auto output = args[1].cast<intptr_t>();
-        cpuinfer.submit(&Linear::forward, linear,
-                        (const void*)input, (void*)output);
-    }
-    static void bind_warm_up(CPUInfer& cpuinfer, Linear* linear, py::args args, py::kwargs kwargs) {
-        cpuinfer.submit(&Linear::warm_up, linear);
-    }
-    static void bind_functions(CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) {
-        auto linear = func.attr("__self__").cast<Linear*>();
-        std::string func_name = py::str(func.attr("__func__").attr("__name__"));
-        if (func_name == "forward") {
-            bind_forward(cpuinfer, linear, args, kwargs);
-        } else if (func_name == "warm_up") {
-            bind_warm_up(cpuinfer, linear, args, kwargs);
-        } else {
-            throw py::value_error("Unsupported function: " + std::string(func_name));
-        }
-    }
+    class WarmUpBindinds {
+       public:
+        struct Args {
+            CPUInfer* cpuinfer;
+            Linear* linear;
+        };
+        static void inner(void* args) {
+            Args* args_ = (Args*)args;
+            args_->cpuinfer->enqueue(&Linear::warm_up, args_->linear);
+        }
+        static std::pair<intptr_t, intptr_t> interface(Linear& linear) {
+            Args* args = new Args{nullptr, &linear};
+            return std::make_pair((intptr_t)&inner, (intptr_t)args);
+        }
+    };
+    class ForwardBindings {
+       public:
+        struct Args {
+            CPUInfer* cpuinfer;
+            Linear* linear;
+            int qlen;
+            const void* input;
+            void* output;
+        };
+        static void inner(void* args) {
+            Args* args_ = (Args*)args;
+            args_->cpuinfer->enqueue(&Linear::forward, args_->linear, args_->qlen, args_->input, args_->output);
+        }
+        static std::pair<intptr_t, intptr_t> interface(Linear& linear, int qlen, intptr_t input, intptr_t output) {
+            Args* args = new Args{nullptr, &linear, qlen, (const void*)input, (void*)output};
+            return std::make_pair((intptr_t)&inner, (intptr_t)args);
+        }
+    };
 };

-// Binding functions for the MLP class
 class MLPBindings {
    public:
-    static void bind_forward(CPUInfer& cpuinfer, MLP* mlp, py::args args, py::kwargs kwargs) {
-        auto input = args[0].cast<intptr_t>();
-        auto output = args[1].cast<intptr_t>();
-        cpuinfer.submit(&MLP::forward, mlp,
-                        (const void*)input, (void*)output);
-    }
-    static void bind_warm_up(CPUInfer& cpuinfer, MLP* mlp, py::args args, py::kwargs kwargs) {
-        cpuinfer.submit(&MLP::warm_up, mlp);
-    }
-    static void bind_functions(CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) {
-        auto mlp = func.attr("__self__").cast<MLP*>();
-        std::string func_name = py::str(func.attr("__func__").attr("__name__"));
-        if (func_name == "forward") {
-            bind_forward(cpuinfer, mlp, args, kwargs);
-        } else if (func_name == "warm_up") {
-            bind_warm_up(cpuinfer, mlp, args, kwargs);
-        } else {
-            throw py::value_error("Unsupported function: " + std::string(func_name));
-        }
-    }
+    class WarmUpBindinds {
+       public:
+        struct Args {
+            CPUInfer* cpuinfer;
+            MLP* mlp;
+        };
+        static void inner(void* args) {
+            Args* args_ = (Args*)args;
+            args_->cpuinfer->enqueue(&MLP::warm_up, args_->mlp);
+        }
+        static std::pair<intptr_t, intptr_t> interface(MLP& mlp) {
+            Args* args = new Args{nullptr, &mlp};
+            return std::make_pair((intptr_t)&inner, (intptr_t)args);
+        }
+    };
+    class ForwardBindings {
+       public:
+        struct Args {
+            CPUInfer* cpuinfer;
+            MLP* mlp;
+            int qlen;
+            const void* input;
+            void* output;
+        };
+        static void inner(void* args) {
+            Args* args_ = (Args*)args;
+            args_->cpuinfer->enqueue(&MLP::forward, args_->mlp, args_->qlen, args_->input, args_->output);
+        }
+        static std::pair<intptr_t, intptr_t> interface(MLP& mlp, int qlen, intptr_t input, intptr_t output) {
+            Args* args = new Args{nullptr, &mlp, qlen, (const void*)input, (void*)output};
+            return std::make_pair((intptr_t)&inner, (intptr_t)args);
+        }
+    };
 };

-// Binding functions for the MOE class
 class MOEBindings {
    public:
-    static void bind_forward(CPUInfer& cpuinfer, MOE* moe, py::args args, py::kwargs kwargs) {
-        int qlen = args[0].cast<int>();
-        int k = args[1].cast<int>();
-        auto expert_ids = args[2].cast<intptr_t>();
-        auto weights = args[3].cast<intptr_t>();
-        auto input = args[4].cast<intptr_t>();
-        auto output = args[5].cast<intptr_t>();
-        cpuinfer.submit(&MOE::forward, moe,
-                        qlen, k, (const uint64_t*)expert_ids, (const float*)weights, (const void*)input, (void*)output);
-    }
-    static void bind_warm_up(CPUInfer& cpuinfer, MOE* moe, py::args args, py::kwargs kwargs) {
-        cpuinfer.submit(&MOE::warm_up, moe);
-    }
-    static void bind_functions(CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) {
-        auto moe = func.attr("__self__").cast<MOE*>();
-        std::string func_name = py::str(func.attr("__func__").attr("__name__"));
-        if (func_name == "forward") {
-            bind_forward(cpuinfer, moe, args, kwargs);
-        } else if (func_name == "warm_up") {
-            bind_warm_up(cpuinfer, moe, args, kwargs);
-        } else {
-            throw py::value_error("Unsupported function: " + std::string(func_name));
-        }
-    }
-};
-
-struct MOEForwardArgs {
-    CPUInfer* cpuinfer;
-    MOE* moe;
-    int qlen;
-    int k;
-    uint64_t* expert_ids;
-    float* weights;
-    void* input;
-    void* output;
-};
-
-void submit_moe_forward_with_host_args_ptr(void* host_args_ptr) {
-    MOEForwardArgs* host_args = (MOEForwardArgs*)host_args_ptr;
-    host_args->cpuinfer->submit(&MOE::forward, host_args->moe,
-                                host_args->qlen, host_args->k, host_args->expert_ids, host_args->weights, host_args->input, host_args->output);
-}
-
-void cpuinfer_sync(void* host_args_ptr) {
-    CPUInfer* cpuinfer = (CPUInfer*)host_args_ptr;
-    cpuinfer->sync();
-}
+    class WarmUpBindinds {
+       public:
+        struct Args {
+            CPUInfer* cpuinfer;
+            MOE* moe;
+        };
+        static void inner(void* args) {
+            Args* args_ = (Args*)args;
+            args_->cpuinfer->enqueue(&MOE::warm_up, args_->moe);
+        }
+        static std::pair<intptr_t, intptr_t> interface(MOE& moe) {
+            Args* args = new Args{nullptr, &moe};
+            return std::make_pair((intptr_t)&inner, (intptr_t)args);
+        }
+    };
+    class ForwardBindings {
+       public:
+        struct Args {
+            CPUInfer* cpuinfer;
+            MOE* moe;
+            int qlen;
+            int k;
+            const uint64_t* expert_ids;
+            const float* weights;
+            const void* input;
+            void* output;
+        };
+        static void inner(void* args) {
+            Args* args_ = (Args*)args;
+            args_->cpuinfer->enqueue(&MOE::forward, args_->moe, args_->qlen, args_->k, args_->expert_ids, args_->weights, args_->input, args_->output);
+        }
+        static std::pair<intptr_t, intptr_t> interface(MOE& moe, int qlen, int k, intptr_t expert_ids, intptr_t weights, intptr_t input, intptr_t output) {
+            Args* args = new Args{nullptr, &moe, qlen, k, (const uint64_t*)expert_ids, (const float*)weights, (const void*)input, (void*)output};
+            return std::make_pair((intptr_t)&inner, (intptr_t)args);
+        }
+    };
+};

 PYBIND11_MODULE(cpuinfer_ext, m) {
+    py::class_<CPUInfer>(m, "CPUInfer")
+        .def(py::init<int>())
+        .def("submit", &CPUInfer::submit)
+        .def("submit_with_cuda_stream", &CPUInfer::submit_with_cuda_stream)
+        .def("sync", &CPUInfer::sync)
+        .def("sync_with_cuda_stream", &CPUInfer::sync_with_cuda_stream);
+
     auto linear_module = m.def_submodule("linear");
     py::class_<LinearConfig>(linear_module, "LinearConfig")
-        .def(py::init([](int hidden_size, int intermediate_size, int stride, intptr_t proj, int proj_type, int hidden_type) {
-            return LinearConfig(hidden_size, intermediate_size, stride, (void*)proj, (ggml_type)proj_type, (ggml_type)hidden_type);
+        .def(py::init([](int hidden_size, int intermediate_size, int stride, int group_max_len, intptr_t proj, int proj_type, int hidden_type) {
+            return LinearConfig(hidden_size, intermediate_size, stride, group_max_len, (void*)proj, (ggml_type)proj_type, (ggml_type)hidden_type);
         }));
     py::class_<Linear>(linear_module, "Linear")
         .def(py::init<LinearConfig>())
-        .def("warm_up", [](Linear& linear) {
-            throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n");
-        })
-        .def("forward", [](Linear& linear, intptr_t input, intptr_t output) {
-            throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n");
-        });
+        .def("warm_up", &LinearBindings::WarmUpBindinds::interface)
+        .def("forward", &LinearBindings::ForwardBindings::interface);

     auto mlp_module = m.def_submodule("mlp");
     py::class_<MLPConfig>(mlp_module, "MLPConfig")
-        .def(py::init([](int hidden_size, int intermediate_size, int stride, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) {
-            return MLPConfig(hidden_size, intermediate_size, stride, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type);
+        .def(py::init([](int hidden_size, int intermediate_size, int stride, int group_max_len, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) {
+            return MLPConfig(hidden_size, intermediate_size, stride, group_max_len, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type);
         }));
     py::class_<MLP>(mlp_module, "MLP")
         .def(py::init<MLPConfig>())
-        .def("warm_up", [](MLP& mlp) {
-            throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n");
-        })
-        .def("forward", [](MLP& mlp, intptr_t input, intptr_t output) {
-            throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n");
-        });
+        .def("warm_up", &MLPBindings::WarmUpBindinds::interface)
+        .def("forward", &MLPBindings::ForwardBindings::interface);

     auto moe_module = m.def_submodule("moe");
     py::class_<MOEConfig>(moe_module, "MOEConfig")
         .def(py::init([](int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int stride, int group_min_len, int group_max_len, intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj, int gate_type, int up_type, int down_type, int hidden_type) {
             return MOEConfig(expert_num, routed_expert_num, hidden_size, intermediate_size, stride, group_min_len, group_max_len, (void*)gate_proj, (void*)up_proj, (void*)down_proj, (ggml_type)gate_type, (ggml_type)up_type, (ggml_type)down_type, (ggml_type)hidden_type);
         }));
     py::class_<MOE>(moe_module, "MOE")
         .def(py::init<MOEConfig>())
-        .def("warm_up", [](MOE& moe) {
-            throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n");
-        })
-        .def("forward", [](MOE& moe, int k, uint64_t expert_ids, intptr_t weights, intptr_t input, intptr_t output) {
-            throw std::runtime_error("!!! Doing nothing, please use CPUInfer.submit to call it!!!\n");
-        });
-
-    py::class_<CPUInfer>(m, "CPUInfer")
-        .def(py::init<int>())
-        .def("submit", [linear_module, mlp_module, moe_module](CPUInfer& cpuinfer, py::object func, py::args args, py::kwargs kwargs) {
-            if (py::hasattr(func, "__self__") && py::hasattr(func, "__func__")) {
-                std::string class_name = py::str(func.attr("__self__").attr("__class__").attr("__name__"));
-                if (class_name == "Linear") {
-                    LinearBindings::bind_functions(cpuinfer, func, args, kwargs);
-                } else if (class_name == "MLP") {
-                    MLPBindings::bind_functions(cpuinfer, func, args, kwargs);
-                } else if (class_name == "MOE") {
-                    MOEBindings::bind_functions(cpuinfer, func, args, kwargs);
-                } else {
-                    // handle other classes
-                    throw py::type_error("Unsupported class type: " + class_name);
-                }
-            } else {
-                // handle cases where func does not have __self__ or __func__
-                throw py::type_error("Invalid function object: missing __self__ or __func__ attribute.");
-            }
-        })
-        .def("submit_with_cuda_stream", [linear_module, mlp_module, moe_module](CPUInfer& cpuinfer, intptr_t user_cuda_stream, py::object func, py::args args, py::kwargs kwargs) {
-            if (py::hasattr(func, "__self__") && py::hasattr(func, "__func__")) {
-                std::string class_name = py::str(func.attr("__self__").attr("__class__").attr("__name__"));
-                if (class_name == "MOE") {
-                    std::string func_name = py::str(func.attr("__func__").attr("__name__"));
-                    if (func_name == "forward") {
-                        auto moe = func.attr("__self__").cast<MOE*>();
-                        int qlen = args[0].cast<int>();
-                        int k = args[1].cast<int>();
-                        auto expert_ids = args[2].cast<intptr_t>();
-                        auto weights = args[3].cast<intptr_t>();
-                        auto input = args[4].cast<intptr_t>();
-                        auto output = args[5].cast<intptr_t>();
-                        MOEForwardArgs* moe_forward_args = new MOEForwardArgs{&cpuinfer, moe, qlen, k, (uint64_t*)expert_ids, (float*)weights, (void*)input, (void*)output};
-                        // submit_moe_forward_with_host_args_ptr(moe_forward_args);
-                        cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)submit_moe_forward_with_host_args_ptr, moe_forward_args);
-                    } else {
-                        throw py::value_error("Unsupported function: " + std::string(func_name));
-                    }
-                } else {
-                    // handle other classes
-                    throw py::type_error("Unsupported class type: " + class_name);
-                }
-            } else {
-                // handle cases where func does not have __self__ or __func__
-                throw py::type_error("Invalid function object: missing __self__ or __func__ attribute.");
-            }
-        })
-        .def("sync_with_cuda_stream", [](CPUInfer& cpuinfer, intptr_t user_cuda_stream) {
-            // cpuinfer_sync((void*)(&cpuinfer));
-            cudaLaunchHostFunc((cudaStream_t)user_cuda_stream, (cudaHostFn_t)cpuinfer_sync, (void*)(&cpuinfer));
-        })
-        .def("sync", &CPUInfer::sync);
+        .def("warm_up", &MOEBindings::WarmUpBindinds::interface)
+        .def("forward", &MOEBindings::ForwardBindings::interface);
 }
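With the CUDA-stream plumbing moved out of the binding lambdas, the new module exposes submit_with_cuda_stream and sync_with_cuda_stream directly on CPUInfer, taking a raw stream handle plus the packed task. A sketch of how the stream-aware entry points might be driven from Python; the use of torch.cuda.current_stream().cuda_stream as the handle and the surrounding tensor names are assumptions, not taken from this commit:

    import torch

    stream = torch.cuda.current_stream().cuda_stream  # raw CUDA stream handle (int)
    CPUInfer.submit_with_cuda_stream(
        stream,
        moe.forward(qlen, n_routed_experts,
                    expert_ids.data_ptr(), weights.data_ptr(),
                    input.data_ptr(), output.data_ptr()))
    CPUInfer.sync_with_cuda_stream(stream)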
ktransformers/ktransformers_ext/operators/llamafile/linear.cpp
View file @ f2938031
...
@@ -3,7 +3,7 @@
  * @Author       : chenht2022
  * @Date         : 2024-07-12 10:07:58
  * @Version      : 1.0.0
  * @LastEditors  : chenht2022
  * @LastEditTime : 2024-07-25 10:34:58
  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
...
@@ -13,9 +13,15 @@ Linear::Linear(LinearConfig config) {
     config_ = config;
     proj_ = config_.proj;
-    input_fp32_.resize(config_.input_size);
-    proj_input_.resize(config_.input_size * 4);
-    proj_output_.resize(config_.output_size);
+    std::vector<std::pair<void**, uint64_t>> mem_requests;
+    mem_requests.push_back({(void**)&input_fp32_, sizeof(float) * config_.group_max_len * config_.input_size});
+    mem_requests.push_back({(void**)&proj_input_, config_.group_max_len * config_.input_size * ggml_type_size(ggml_internal_get_type_traits(config_.proj_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.proj_type).vec_dot_type)});
+    mem_requests.push_back({(void**)&proj_output_, sizeof(float) * config_.group_max_len * config_.output_size});
+    shared_mem_buffer.alloc(this, mem_requests);
+}
+
+Linear::~Linear() {
+    shared_mem_buffer.dealloc(this);
 }

 void Linear::warm_up(Backend* backend) {
...
@@ -26,22 +32,42 @@ void Linear::warm_up(Backend* backend) {
         input_fp32[i] = 0;
     }
     from_float(input_fp32.data(), input.data(), config_.input_size, config_.hidden_type);
-    forward(input.data(), output.data(), backend);
+    forward_many(1, input.data(), output.data(), backend);
 }

-void Linear::forward(const void* input, void* output, Backend* backend) {
+void Linear::forward_many(int qlen, const void* input, void* output, Backend* backend) {
     const void* proj_input_ptr;
     if (config_.hidden_type == ggml_internal_get_type_traits(config_.proj_type).vec_dot_type) {
         proj_input_ptr = input;
     } else {
-        to_float(input, input_fp32_.data(), config_.input_size, config_.hidden_type);
-        from_float(input_fp32_.data(), proj_input_.data(), config_.input_size, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type);
-        proj_input_ptr = proj_input_.data();
+        to_float(input, input_fp32_, qlen * config_.input_size, config_.hidden_type);
+        from_float(input_fp32_, proj_input_, qlen * config_.input_size, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type);
+        proj_input_ptr = proj_input_;
     }
     int nth = config_.output_size / config_.stride;
     backend->do_work_stealing_job(nth, [&](int task_id) {
-        int ith = task_id % nth;
-        llamafile_sgemm(config_.output_size, 1, config_.input_size / ggml_blck_size(config_.proj_type), proj_, config_.input_size / ggml_blck_size(config_.proj_type), proj_input_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_output_.data(), config_.output_size, ith, nth, GGML_TASK_TYPE_COMPUTE, config_.proj_type, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
+        int ith = task_id;
+        void* proj_ptr = proj_ + ith * config_.stride * config_.input_size * ggml_type_size(config_.proj_type) / ggml_blck_size(config_.proj_type);
+        float* proj_output_ptr = proj_output_ + ith * config_.stride;
+        llamafile_sgemm(config_.stride, qlen, config_.input_size / ggml_blck_size(config_.proj_type), proj_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_input_ptr, config_.input_size / ggml_blck_size(config_.proj_type), proj_output_ptr, config_.output_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.proj_type, ggml_internal_get_type_traits(config_.proj_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
+        if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
+            for (int i = 0; i < qlen; i++) {
+                float* output_fp32_ptr = proj_output_ + i * config_.output_size + ith * config_.stride;
+                void* output_ptr = output + i * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+                from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
+            }
+        }
     });
-    from_float(proj_output_.data(), output, config_.output_size, config_.hidden_type);
+    if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
+        from_float(proj_output_, output, qlen * config_.output_size, config_.hidden_type);
+    }
+}
+
+void Linear::forward(int qlen, const void* input, void* output, Backend* backend) {
+    if (qlen <= 0) {
+        return;
+    }
+    int forward_len = std::min(qlen, config_.group_max_len);
+    forward_many(forward_len, input, output, backend);
+    forward(qlen - forward_len, input + forward_len * config_.input_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.output_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
 }
\ No newline at end of file
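The new Linear::forward(qlen, ...) caps each pass at group_max_len rows and recurses on the remainder, advancing the input and output pointers by the bytes consumed. A minimal Python sketch of that chunking idea, written iteratively rather than recursively; forward_many here stands in for the per-chunk worker and is an assumed callable, not part of this commit:

    def forward(tokens, group_max_len, forward_many):
        # Process at most group_max_len rows per call, mirroring the
        # tail-recursive splitting in Linear/MLP/MOE::forward.
        start = 0
        while start < len(tokens):
            end = min(start + group_max_len, len(tokens))
            forward_many(tokens[start:end])
            start = end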
ktransformers/ktransformers_ext/operators/llamafile/linear.h
View file @ f2938031
...
@@ -3,7 +3,7 @@
  * @Author       : chenht2022
  * @Date         : 2024-07-12 10:07:58
  * @Version      : 1.0.0
  * @LastEditors  : chenht2022
  * @LastEditTime : 2024-07-25 10:35:00
  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
...
@@ -22,34 +22,38 @@
 #include "llama.cpp/ggml-quants.h"
 #include "llama.cpp/ggml.h"
 #include "llamafile/sgemm.h"
+#include "shared_mem_buffer.h"

 struct LinearConfig {
     int input_size;
     int output_size;
     int stride;
+    int group_max_len;
     void* proj;
     ggml_type proj_type;
     ggml_type hidden_type;
     LinearConfig() {}
-    LinearConfig(int input_size, int output_size, int stride, void* proj, ggml_type proj_type, ggml_type hidden_type)
-        : input_size(input_size), output_size(output_size), stride(stride), proj(proj), proj_type(proj_type), hidden_type(hidden_type) {}
+    LinearConfig(int input_size, int output_size, int stride, int group_max_len, void* proj, ggml_type proj_type, ggml_type hidden_type)
+        : input_size(input_size), output_size(output_size), stride(stride), group_max_len(group_max_len), proj(proj), proj_type(proj_type), hidden_type(hidden_type) {}
 };

 class Linear {
    public:
     Linear(LinearConfig);
+    ~Linear();
     void warm_up(Backend* backend);
-    void forward(const void* input, void* output, Backend* backend);
+    void forward_many(int qlen, const void* input, void* output, Backend* backend);
+    void forward(int qlen, const void* input, void* output, Backend* backend);

    private:
     LinearConfig config_;
     void* proj_;  // [output_size * input_size ( /32 if quantized)]
-    std::vector<float> input_fp32_;    // [input_size]
-    std::vector<uint8_t> proj_input_;  // [input_size * 4]
-    std::vector<float> proj_output_;   // [output_size]
+    float* input_fp32_;    // [group_max_len * input_size]
+    uint8_t* proj_input_;  // [group_max_len * input_size * ggml_type_size(ggml_internal_get_type_traits(proj_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(proj_type).vec_dot_type)]
+    float* proj_output_;   // [group_max_len * output_size]
 };

 #endif
\ No newline at end of file
ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp
View file @ f2938031
...
@@ -3,7 +3,7 @@
  * @Author       : chenht2022
  * @Date         : 2024-07-16 10:43:18
  * @Version      : 1.0.0
  * @LastEditors  : chenht2022
  * @LastEditTime : 2024-07-25 10:35:04
  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
...
@@ -15,14 +15,20 @@ MLP::MLP(MLPConfig config) {
     up_proj_ = config_.up_proj;
     down_proj_ = config_.down_proj;
-    input_fp32_.resize(config_.hidden_size);
-    gate_input_.resize(config_.hidden_size * 4);
-    up_input_.resize(config_.hidden_size * 4);
-    gate_output_.resize(config_.intermediate_size);
-    up_output_.resize(config_.intermediate_size);
-    intermediate_fp32_.resize(config_.intermediate_size);
-    down_input_.resize(config_.intermediate_size * 4);
-    down_output_.resize(config_.hidden_size);
+    std::vector<std::pair<void**, uint64_t>> mem_requests;
+    mem_requests.push_back({(void**)&input_fp32_, sizeof(float) * config_.group_max_len * config_.hidden_size});
+    mem_requests.push_back({(void**)&gate_input_, config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
+    mem_requests.push_back({(void**)&up_input_, config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
+    mem_requests.push_back({(void**)&gate_output_, sizeof(float) * config_.group_max_len * config_.intermediate_size});
+    mem_requests.push_back({(void**)&up_output_, sizeof(float) * config_.group_max_len * config_.intermediate_size});
+    mem_requests.push_back({(void**)&intermediate_fp32_, sizeof(float) * config_.group_max_len * config_.intermediate_size});
+    mem_requests.push_back({(void**)&down_input_, config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)});
+    mem_requests.push_back({(void**)&down_output_, sizeof(float) * config_.group_max_len * config_.hidden_size});
+    shared_mem_buffer.alloc(this, mem_requests);
+}
+
+MLP::~MLP() {
+    shared_mem_buffer.dealloc(this);
 }

 void MLP::warm_up(Backend* backend) {
...
@@ -33,33 +39,33 @@ void MLP::warm_up(Backend* backend) {
         input_fp32[i] = 0;
     }
     from_float(input_fp32.data(), input.data(), config_.hidden_size, config_.hidden_type);
-    forward(input.data(), output.data(), backend);
+    forward_many(1, input.data(), output.data(), backend);
 }

 static float act_fn(float x) {
     return x / (1.0f + expf(-x));
 }

-void MLP::forward(const void* input, void* output, Backend* backend) {
+void MLP::forward_many(int qlen, const void* input, void* output, Backend* backend) {
     const void* gate_input_ptr;
     const void* up_input_ptr;
     if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
         gate_input_ptr = up_input_ptr = input;
     } else {
-        to_float(input, input_fp32_.data(), config_.hidden_size, config_.hidden_type);
+        to_float(input, input_fp32_, qlen * config_.hidden_size, config_.hidden_type);
         if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
-            from_float(input_fp32_.data(), gate_input_.data(), config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
-            gate_input_ptr = up_input_ptr = gate_input_.data();
+            from_float(input_fp32_, gate_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
+            gate_input_ptr = up_input_ptr = gate_input_;
         } else {
             if (config_.hidden_type != ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) {
-                from_float(input_fp32_.data(), gate_input_.data(), config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
-                gate_input_ptr = gate_input_.data();
+                from_float(input_fp32_, gate_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
+                gate_input_ptr = gate_input_;
             } else {
                 gate_input_ptr = input;
             }
             if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
-                from_float(input_fp32_.data(), up_input_.data(), config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
-                up_input_ptr = up_input_.data();
+                from_float(input_fp32_, up_input_, qlen * config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
+                up_input_ptr = up_input_;
             } else {
                 up_input_ptr = input;
             }
...
@@ -69,35 +75,49 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
     backend->do_work_stealing_job(nth, [&](int task_id) {
         int ith = task_id;
         void* gate_proj_ptr = gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
-        float* gate_output_ptr = gate_output_.data() + ith * config_.stride;
-        llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
+        float* gate_output_ptr = gate_output_ + ith * config_.stride;
+        llamafile_sgemm(config_.stride, qlen, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         void* up_proj_ptr = up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
-        float* up_output_ptr = up_output_.data() + ith * config_.stride;
-        llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
-        for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
-            intermediate_fp32_[i] = act_fn(gate_output_[i]) * up_output_[i];
-        }
-        if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
-            float* intermediate_fp32_ptr = intermediate_fp32_.data() + ith * config_.stride;
-            void* down_input_ptr = down_input_.data() + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
-            from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
-        }
+        float* up_output_ptr = up_output_ + ith * config_.stride;
+        llamafile_sgemm(config_.stride, qlen, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
+        for (int i = 0; i < qlen; i++) {
+            for (int j = ith * config_.stride; j < (ith + 1) * config_.stride; j++) {
+                intermediate_fp32_[i * config_.intermediate_size + j] = act_fn(gate_output_[i * config_.intermediate_size + j]) * up_output_[i * config_.intermediate_size + j];
+            }
+            if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
+                float* intermediate_fp32_ptr = intermediate_fp32_ + i * config_.intermediate_size + ith * config_.stride;
+                void* down_input_ptr = down_input_ + i * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
+                from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
+            }
+        }
     });
     if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) != 0) {
-        from_float(intermediate_fp32_.data(), down_input_.data(), config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
+        from_float(intermediate_fp32_, down_input_, qlen * config_.intermediate_size, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
     }
     nth = config_.hidden_size / config_.stride;
     backend->do_work_stealing_job(nth, [&](int task_id) {
         int ith = task_id;
         void* down_proj_ptr = down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
-        float* down_output_ptr = down_output_.data() + ith * config_.stride;
-        llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_.data(), config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
+        float* down_output_ptr = down_output_ + ith * config_.stride;
+        llamafile_sgemm(config_.stride, qlen, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
-            void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
-            from_float(down_output_ptr, output_ptr, config_.stride, config_.hidden_type);
+            for (int i = 0; i < qlen; i++) {
+                float* output_fp32_ptr = down_output_ + i * config_.hidden_size + ith * config_.stride;
+                void* output_ptr = output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type) + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+                from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
+            }
         }
     });
     if (config_.stride % ggml_blck_size(config_.hidden_type) != 0) {
-        from_float(down_output_.data(), output, config_.hidden_size, config_.hidden_type);
+        from_float(down_output_, output, qlen * config_.hidden_size, config_.hidden_type);
     }
 }
+
+void MLP::forward(int qlen, const void* input, void* output, Backend* backend) {
+    if (qlen <= 0) {
+        return;
+    }
+    int forward_len = std::min(qlen, config_.group_max_len);
+    forward_many(forward_len, input, output, backend);
+    forward(qlen - forward_len, input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
 }
\ No newline at end of file
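The grouped kernel above still computes down(act(gate(x)) * up(x)) per row, with act(x) = x / (1 + e^-x) (SiLU), just for qlen rows at a time. A compact torch reference of that math, in the spirit of the mlp_torch helper the new test scripts call; the exact signature and layout (weights stored as [out_features, in_features]) are assumptions here:

    import torch

    def mlp_reference(x, gate_proj, up_proj, down_proj):
        # x: [qlen, hidden_size]; each projection: [out_features, in_features]
        gate = torch.mm(x, gate_proj.t())
        up = torch.mm(x, up_proj.t())
        intermediate = gate / (1.0 + torch.exp(-gate)) * up  # SiLU(gate) * up
        return torch.mm(intermediate, down_proj.t())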
ktransformers/ktransformers_ext/operators/llamafile/mlp.h
View file @ f2938031
...
@@ -3,7 +3,7 @@
  * @Author       : chenht2022
  * @Date         : 2024-07-12 10:07:58
  * @Version      : 1.0.0
  * @LastEditors  : chenht2022
  * @LastEditTime : 2024-07-25 10:35:06
  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
...
@@ -22,11 +22,13 @@
 #include "llama.cpp/ggml-quants.h"
 #include "llama.cpp/ggml.h"
 #include "llamafile/sgemm.h"
+#include "shared_mem_buffer.h"

 struct MLPConfig {
     int hidden_size;
     int intermediate_size;
     int stride;
+    int group_max_len;
     void* gate_proj;
     void* up_proj;
     void* down_proj;
...
@@ -37,15 +39,17 @@ struct MLPConfig {
     MLPConfig() {}
-    MLPConfig(int hidden_size, int intermediate_size, int stride, void* gate_proj, void* up_proj, void* down_proj, ggml_type gate_type, ggml_type up_type, ggml_type down_type, ggml_type hidden_type)
-        : hidden_size(hidden_size), intermediate_size(intermediate_size), stride(stride), gate_proj(gate_proj), up_proj(up_proj), down_proj(down_proj), gate_type(gate_type), up_type(up_type), down_type(down_type), hidden_type(hidden_type) {}
+    MLPConfig(int hidden_size, int intermediate_size, int stride, int group_max_len, void* gate_proj, void* up_proj, void* down_proj, ggml_type gate_type, ggml_type up_type, ggml_type down_type, ggml_type hidden_type)
+        : hidden_size(hidden_size), intermediate_size(intermediate_size), stride(stride), group_max_len(group_max_len), gate_proj(gate_proj), up_proj(up_proj), down_proj(down_proj), gate_type(gate_type), up_type(up_type), down_type(down_type), hidden_type(hidden_type) {}
 };

 class MLP {
    public:
     MLP(MLPConfig);
+    ~MLP();
     void warm_up(Backend* backend);
-    void forward(const void* input, void* output, Backend* backend);
+    void forward_many(int qlen, const void* input, void* output, Backend* backend);
+    void forward(int qlen, const void* input, void* output, Backend* backend);

    private:
     MLPConfig config_;
...
@@ -53,14 +57,14 @@ class MLP {
     void* up_proj_;    // [intermediate_size * hidden_size ( /32 if quantized)]
     void* down_proj_;  // [hidden_size * intermediate_size ( /32 if quantized)]
-    std::vector<float> input_fp32_;         // [hidden_size]
-    std::vector<uint8_t> gate_input_;       // [hidden_size * 4]
-    std::vector<uint8_t> up_input_;         // [hidden_size * 4]
-    std::vector<float> gate_output_;        // [intermediate_size]
-    std::vector<float> up_output_;          // [intermediate_size]
-    std::vector<float> intermediate_fp32_;  // [intermediate_size]
-    std::vector<uint8_t> down_input_;       // [intermediate_size * 4]
-    std::vector<float> down_output_;        // [hidden_size]
+    float* input_fp32_;         // [group_max_len * hidden_size]
+    uint8_t* gate_input_;       // [group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(gate_type).vec_dot_type)]
+    uint8_t* up_input_;         // [group_max_len * hidden_size * ggml_type_size(ggml_internal_get_type_traits(up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(up_type).vec_dot_type)]
+    float* gate_output_;        // [group_max_len * intermediate_size]
+    float* up_output_;          // [group_max_len * intermediate_size]
+    float* intermediate_fp32_;  // [group_max_len * intermediate_size]
+    uint8_t* down_input_;       // [group_max_len * intermediate_size * ggml_type_size(ggml_internal_get_type_traits(down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(down_type).vec_dot_type)]
+    float* down_output_;        // [group_max_len * hidden_size]
 };

 #endif
\ No newline at end of file
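These buffers are no longer owned vectors; they are raw pointers that each operator registers with the shared memory buffer added in this commit (shared_mem_buffer.h/.cpp), via alloc(this, requests) in the constructor and dealloc(this) in the destructor. A small pure-Python sketch of that registration idea, under the assumption that the arena is sized to the largest total request and handed out segment by segment; the class and method names below are illustrative only:

    class SharedMemBufferSketch:
        def __init__(self):
            self.buffer = bytearray()
            self.requests = {}  # owner -> list of (name, size_in_bytes)

        def alloc(self, owner, requests):
            # Grow the arena if this owner's total request is the largest seen,
            # then hand back an offset for each requested segment.
            total = sum(size for _, size in requests)
            if total > len(self.buffer):
                self.buffer = bytearray(total)
            offsets, cursor = {}, 0
            for name, size in requests:
                offsets[name] = cursor
                cursor += size
            self.requests[owner] = requests
            return offsets

        def dealloc(self, owner):
            self.requests.pop(owner, None)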
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp
View file @ f2938031
 /**
  * @Description  :
  * @Author       : chenht2022
  * @Date         : 2024-07-22 02:03:22
  * @Version      : 1.0.0
  * @LastEditors  : chenht2022
  * @LastEditTime : 2024-07-25 10:35:07
  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
 #include "moe.h"
 #include <iostream>
 #include "unistd.h"

-void* MOE::buffer_ = nullptr;
-
 MOE::MOE(MOEConfig config) {
     config_ = config;
     gate_proj_ = config_.gate_proj;
     up_proj_ = config_.up_proj;
     down_proj_ = config_.down_proj;

-    if (MOE::buffer_ == nullptr) {
-        uint64_t buffer_size = 0;
-        buffer_size += sizeof(float) * config_.group_max_len * config_.hidden_size;
-        buffer_size += config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
-        buffer_size += config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
-        buffer_size += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
-        buffer_size += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
-        buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
-        buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
-        buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
-        buffer_size += config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
-        buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size;
-        buffer_size += sizeof(float) * config_.group_max_len * config_.hidden_size;
-        buffer_ = malloc(buffer_size);
-    }
-
-    uint64_t offset = 0;
-    s_input_fp32_ = (float*)(buffer_ + offset);
-    offset += sizeof(float) * config_.hidden_size;
-    s_gate_input_ = (uint8_t*)(buffer_ + offset);
-    offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
-    s_up_input_ = (uint8_t*)(buffer_ + offset);
-    offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
+    std::vector<std::pair<void**, uint64_t>> s_mem_requests;
+    s_mem_requests.push_back({(void**)&s_input_fp32_, sizeof(float) * config_.hidden_size});
+    s_mem_requests.push_back({(void**)&s_gate_input_, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
+    s_mem_requests.push_back({(void**)&s_up_input_, config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
     s_gate_output_.resize(config_.routed_expert_num);
     s_up_output_.resize(config_.routed_expert_num);
     s_intermediate_fp32_.resize(config_.routed_expert_num);
     s_down_input_.resize(config_.routed_expert_num);
     s_down_output_.resize(config_.routed_expert_num);
     for (int i = 0; i < config_.routed_expert_num; i++) {
-        s_gate_output_[i] = (float*)(buffer_ + offset);
-        offset += sizeof(float) * config_.intermediate_size;
-        s_up_output_[i] = (float*)(buffer_ + offset);
-        offset += sizeof(float) * config_.intermediate_size;
-        s_intermediate_fp32_[i] = (float*)(buffer_ + offset);
-        offset += sizeof(float) * config_.intermediate_size;
-        s_down_input_[i] = (uint8_t*)(buffer_ + offset);
-        offset += config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
-        s_down_output_[i] = (float*)(buffer_ + offset);
-        offset += sizeof(float) * config_.hidden_size;
+        s_mem_requests.push_back({(void**)&s_gate_output_[i], sizeof(float) * config_.intermediate_size});
+        s_mem_requests.push_back({(void**)&s_up_output_[i], sizeof(float) * config_.intermediate_size});
+        s_mem_requests.push_back({(void**)&s_intermediate_fp32_[i], sizeof(float) * config_.intermediate_size});
+        s_mem_requests.push_back({(void**)&s_down_input_[i], config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)});
+        s_mem_requests.push_back({(void**)&s_down_output_[i], sizeof(float) * config_.hidden_size});
     }
-    s_output_fp32_ = (float*)(buffer_ + offset);
-
-    offset = 0;
+    s_mem_requests.push_back({(void**)&s_output_fp32_, sizeof(float) * config_.hidden_size});
+    shared_mem_buffer.alloc(this, s_mem_requests);
+
+    std::vector<std::pair<void**, uint64_t>> m_mem_requests;
     m_input_fp32_.resize(config_.group_max_len);
     m_gate_input_.resize(config_.group_max_len);
     m_up_input_.resize(config_.group_max_len);
     for (int i = 0; i < config_.group_max_len; i++) {
-        m_input_fp32_[i] = (float*)(buffer_ + offset);
-        offset += sizeof(float) * config_.hidden_size;
-        m_gate_input_[i] = (uint8_t*)(buffer_ + offset);
-        offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
-        m_up_input_[i] = (uint8_t*)(buffer_ + offset);
-        offset += config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
+        m_mem_requests.push_back({(void**)&m_input_fp32_[i], sizeof(float) * config_.hidden_size});
+        m_mem_requests.push_back({(void**)&m_gate_input_[i], config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
+        m_mem_requests.push_back({(void**)&m_up_input_[i], config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
     }
-    m_local_gate_input_ = (uint8_t*)(buffer_ + offset);
-    offset += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
-    m_local_up_input_ = (uint8_t*)(buffer_ + offset);
-    offset += config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
-    m_local_gate_output_ = (float*)(buffer_ + offset);
-    offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
-    m_local_up_output_ = (float*)(buffer_ + offset);
-    offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
-    m_local_intermediate_fp32_ = (float*)(buffer_ + offset);
-    offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size;
-    m_local_down_input_ = (uint8_t*)(buffer_ + offset);
-    offset += config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
-    m_local_down_output_ = (float*)(buffer_ + offset);
-    offset += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size;
+    m_mem_requests.push_back({(void**)&m_local_gate_input_, config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.gate_type).vec_dot_type)});
+    m_mem_requests.push_back({(void**)&m_local_up_input_, config_.routed_expert_num * config_.group_max_len * config_.hidden_size * ggml_type_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.up_type).vec_dot_type)});
+    m_mem_requests.push_back({(void**)&m_local_gate_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
+    m_mem_requests.push_back({(void**)&m_local_up_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
+    m_mem_requests.push_back({(void**)&m_local_intermediate_fp32_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.intermediate_size});
+    m_mem_requests.push_back({(void**)&m_local_down_input_, config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type)});
+    m_mem_requests.push_back({(void**)&m_local_down_output_, sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size});
     m_output_fp32_.resize(config_.group_max_len);
     for (int i = 0; i < config_.group_max_len; i++) {
-        m_output_fp32_[i] = (float*)(buffer_ + offset);
+        m_mem_requests.push_back({(void**)&m_output_fp32_[i], sizeof(float) *
config_
.
hidden_size
});
offset
+=
sizeof
(
float
)
*
config_
.
hidden_size
;
}
}
shared_mem_buffer
.
alloc
(
this
,
m_mem_requests
);
m_local_pos_
.
resize
(
config_
.
group_max_len
);
m_local_pos_
.
resize
(
config_
.
group_max_len
);
for
(
int
i
=
0
;
i
<
config_
.
group_max_len
;
i
++
)
{
for
(
int
i
=
0
;
i
<
config_
.
group_max_len
;
i
++
)
{
...
@@ -107,6 +72,10 @@ MOE::MOE(MOEConfig config) {
...
@@ -107,6 +72,10 @@ MOE::MOE(MOEConfig config) {
m_local_down_output_ptr_
.
resize
(
config_
.
expert_num
);
m_local_down_output_ptr_
.
resize
(
config_
.
expert_num
);
}
}
MOE
::~
MOE
()
{
shared_mem_buffer
.
dealloc
(
this
);
}
void
MOE
::
warm_up
(
Backend
*
backend
)
{
void
MOE
::
warm_up
(
Backend
*
backend
)
{
std
::
vector
<
float
>
input_fp32
(
config_
.
hidden_size
);
std
::
vector
<
float
>
input_fp32
(
config_
.
hidden_size
);
std
::
vector
<
uint8_t
>
input
(
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
));
std
::
vector
<
uint8_t
>
input
(
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
));
...
...
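In short, this hunk replaces MOE's hand-maintained offset arithmetic with a list of (pointer, size) requests handed to shared_mem_buffer.alloc(this, ...), which places every scratch buffer, and the new ~MOE() drops the registration again. A minimal sketch of the same pattern outside of MOE (the Scratch class, its fields, and the sizes are hypothetical; only shared_mem_buffer.h from this commit is assumed to be on the include path):

// Hypothetical consumer of the shared buffer: register (pointer, size) pairs,
// then let SharedMemBuffer place them instead of tracking offsets by hand.
#include <cstdint>
#include <utility>
#include <vector>
#include "shared_mem_buffer.h"

class Scratch {
   public:
    Scratch(int hidden_size, int intermediate_size) {
        std::vector<std::pair<void**, uint64_t>> requests;
        requests.push_back({(void**)&input_fp32_, sizeof(float) * hidden_size});
        requests.push_back({(void**)&gate_output_, sizeof(float) * intermediate_size});
        shared_mem_buffer.alloc(this, requests);  // fills in both pointers
    }
    ~Scratch() { shared_mem_buffer.dealloc(this); }

   private:
    float* input_fp32_;   // placed at the start of the shared buffer by alloc()
    float* gate_output_;  // placed immediately after input_fp32_
};

Calling dealloc in the destructor matters because alloc keeps each object's request list and replays it whenever the shared buffer grows; a destroyed object has to be removed from that history.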
ktransformers/ktransformers_ext/operators/llamafile/moe.h
View file @ f2938031
...
@@ -3,7 +3,7 @@
 * @Author       : chenht2022
 * @Date         : 2024-07-22 02:03:22
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-07-25 10:35:10
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
...
@@ -22,6 +22,7 @@
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
+#include "shared_mem_buffer.h"

struct MOEConfig {
    int expert_num;
...
@@ -48,13 +49,13 @@ struct MOEConfig {
class MOE {
   public:
    MOE(MOEConfig);
+    ~MOE();
    void warm_up(Backend* backend);
    void forward_one(int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
    void forward_many(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);
    void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);

   private:
-    static void* buffer_;
    MOEConfig config_;
    void* gate_proj_;  // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    void* up_proj_;    // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
...
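The "( /32 if quantized)" notes above, like the repeated ggml_type_size(...) / ggml_blck_size(...) expressions in moe.cpp, compute the byte size of a quantized row: element count divided by the block size, times the bytes per block. A worked example with assumed numbers (GGML's Q8_0 blocks hold 32 values in 34 bytes; the 4096 hidden size is hypothetical, and the real values come from the configured vec_dot_type at runtime):

// Illustration only; in the extension these numbers are queried via
// ggml_type_size()/ggml_blck_size() for config_.gate_type's vec_dot_type.
#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t hidden_size = 4096;  // assumed model dimension
    const uint64_t type_size   = 34;    // bytes per Q8_0 block: 32 int8 values + fp16 scale
    const uint64_t blck_size   = 32;    // values per block
    // Same formula as the buffer requests above: 4096 / 32 blocks * 34 bytes = 4352 bytes.
    uint64_t row_bytes = hidden_size * type_size / blck_size;
    std::printf("one quantized input row: %llu bytes\n", (unsigned long long)row_bytes);
    return 0;
}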
ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.cpp
0 → 100644
View file @ f2938031
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-08-05 04:49:08
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-08-05 09:21:29
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "shared_mem_buffer.h"
#include <cstdio>

SharedMemBuffer::SharedMemBuffer() {
    buffer_ = nullptr;
    size_ = 0;
}

SharedMemBuffer::~SharedMemBuffer() {
    if (buffer_) {
        free(buffer_);
    }
}

void SharedMemBuffer::alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests) {
    uint64_t size = 0;
    for (auto& request : requests) {
        size += request.second;
    }
    if (size > size_) {
        if (buffer_) {
            free(buffer_);
        }
        buffer_ = malloc(size);
        size_ = size;
        for (auto& obj_requests : hist_requests_) {
            for (auto& requests : obj_requests.second) {
                arrange(requests);
            }
        }
    }
    arrange(requests);
    hist_requests_[object].push_back(requests);
}

void SharedMemBuffer::dealloc(void* object) {
    hist_requests_.erase(object);
}

void SharedMemBuffer::arrange(std::vector<std::pair<void**, uint64_t>> requests) {
    uint64_t offset = 0;
    for (auto& request : requests) {
        *(request.first) = buffer_ + offset;
        offset += request.second;
    }
}
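A small sketch of the growth behaviour implemented above (the main function, owner variables, and sizes are made up for illustration): when a later alloc asks for more than the current capacity, the buffer is reallocated and arrange is replayed over every recorded request list, so previously returned pointers are rewritten to point into the new storage.

// Two owners share one buffer; the second, larger request grows it and
// re-runs arrange() over the first owner's requests, updating a_ptr.
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>
#include "shared_mem_buffer.h"

int main() {
    int obj_a = 0, obj_b = 0;  // any distinct addresses work as owner keys
    float* a_ptr = nullptr;
    float* b_ptr = nullptr;

    shared_mem_buffer.alloc(&obj_a, {{(void**)&a_ptr, sizeof(float) * 1024}});
    void* before = a_ptr;

    // Larger total -> buffer is reallocated; a_ptr may move.
    shared_mem_buffer.alloc(&obj_b, {{(void**)&b_ptr, sizeof(float) * 4096}});
    std::printf("a_ptr moved: %s\n", a_ptr == before ? "no" : "yes");

    shared_mem_buffer.dealloc(&obj_a);
    shared_mem_buffer.dealloc(&obj_b);
    return 0;
}

Note that growth uses free followed by malloc rather than realloc, so old contents are not preserved, which suggests these regions are meant as transient per-call scratch space rather than persistent state.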
ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.h
0 → 100644
View file @ f2938031
/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-08-05 04:49:08
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-08-05 06:36:41
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_SHAREDMEMBUFFER_H
#define CPUINFER_SHAREDMEMBUFFER_H

#include <cstdint>
#include <cstdlib>
#include <map>
#include <vector>

class SharedMemBuffer {
   public:
    SharedMemBuffer();
    ~SharedMemBuffer();

    void alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests);
    void dealloc(void* object);

   private:
    void* buffer_;
    uint64_t size_;
    std::map<void*, std::vector<std::vector<std::pair<void**, uint64_t>>>> hist_requests_;

    void arrange(std::vector<std::pair<void**, uint64_t>> requests);
};

static SharedMemBuffer shared_mem_buffer;

#endif
\ No newline at end of file