ox696c / ktransformers / Commits

Commit f3d842a0, authored Apr 25, 2025 by chenht2022
Parent: b90362b5

support AMX

Showing 15 changed files with 1799 additions and 62 deletions (+1799, -62):
  .clang-format                                                     +5    -0
  csrc/ktransformers_ext/CMakeLists.txt                             +2    -1
  csrc/ktransformers_ext/bench/bench_moe_amx.py                     +107  -0
  csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.cpp          +3    -2
  csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.h            +37   -0
  csrc/ktransformers_ext/ext_bindings.cpp                           +91   -0
  csrc/ktransformers_ext/operators/amx/la/amx.hpp                   +974  -0
  csrc/ktransformers_ext/operators/amx/la/utils.hpp                 +46   -0
  csrc/ktransformers_ext/operators/amx/moe.hpp                      +398  -0
  csrc/ktransformers_ext/operators/llamafile/linear.h               +1    -1
  csrc/ktransformers_ext/operators/llamafile/mlp.h                  +1    -1
  csrc/ktransformers_ext/operators/llamafile/moe.h                  +1    -1
  csrc/ktransformers_ext/operators/llamafile/shared_mem_buffer.h    +0    -37
  ktransformers/operators/experts.py                                +56   -19
  ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-amx.yaml   +77   -0
.clang-format (new file, mode 100644)

---
BasedOnStyle: LLVM
ColumnLimit: 120 # set the maximum line width to 120
IndentWidth: 2
---
csrc/ktransformers_ext/CMakeLists.txt

@@ -293,9 +293,10 @@ aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/llamafile SOURCE_DIR3)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llamafile SOURCE_DIR4)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/kvcache SOURCE_DIR5)
+aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/amx SOURCE_DIR6)
-set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4} ${SOURCE_DIR5})
+set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4} ${SOURCE_DIR5} ${SOURCE_DIR6})
 file(GLOB_RECURSE FMT_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.h")
csrc/ktransformers_ext/bench/bench_moe_amx.py (new file, mode 100644)

#!/usr/bin/env python
# coding=utf-8
'''
Description  :
Author       : chenht2022
Date         : 2025-04-25 18:28:12
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2025-04-25 18:28:12
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

expert_num = 8
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
n_routed_experts = 8
layer_num = 10
qlen = 1024
CPUInfer = cpuinfer_ext.CPUInfer(65)
warm_up_iter = 100
test_iter = 100

def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "bf16":
            bytes_per_elem = 2.000000
        elif quant_mode == "int8":
            bytes_per_elem = 1.000000
        else:
            assert(False)

        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            config = cpuinfer_ext.moe.AMX_MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, max_len,
                                                    gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr())
            if quant_mode == "bf16":
                moe = cpuinfer_ext.moe.AMXBF16_MOE(config)
                CPUInfer.submit(moe.load_weights())
                CPUInfer.sync()
            elif quant_mode == "int8":
                moe = cpuinfer_ext.moe.AMXInt8_MOE(config)
                CPUInfer.submit(moe.load_weights())
                CPUInfer.sync()
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)
        expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device="cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
        weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device="cuda").to("cpu").contiguous()
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        qlen_tensor = torch.tensor([qlen], dtype=torch.int32)

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward(
                    qlen,
                    n_routed_experts,
                    expert_ids[i % layer_num].data_ptr(),
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(),
                    output[i % layer_num].data_ptr(),
                    qlen_tensor.data_ptr()
                )
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward(
                    qlen,
                    n_routed_experts,
                    expert_ids[i % layer_num].data_ptr(),
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(),
                    output[i % layer_num].data_ptr(),
                    qlen_tensor.data_ptr()
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter)
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('Flops: ', hidden_size * intermediate_size * qlen * 3 * n_routed_experts * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GFLOPS')
        print('')

bench_moe("bf16")
bench_moe("int8")
csrc/ktransformers_ext/operators/llamafile/shared_mem_buffer.cpp → csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.cpp (renamed)

@@ -30,7 +30,8 @@ void SharedMemBuffer::alloc(void* object, std::vector<std::pair<void**, uint64_t
     if (buffer_) {
         free(buffer_);
     }
-    buffer_ = malloc(size);
+    buffer_ = std::aligned_alloc(64, size);
     size_ = size;
     for (auto& obj_requests : hist_requests_) {
         for (auto& requests : obj_requests.second) {

@@ -52,4 +53,4 @@ void SharedMemBuffer::arrange(std::vector<std::pair<void**, uint64_t>> requests)
         *(request.first) = (uint8_t*)buffer_ + offset;
         offset += request.second;
     }
 }
 \ No newline at end of file
csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.h (new file, mode 100644)

/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-08-05 04:49:08
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-08-05 06:36:41
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_SHAREDMEMBUFFER_H
#define CPUINFER_SHAREDMEMBUFFER_H

#include <cstdint>
#include <cstdlib>
#include <map>
#include <vector>

class SharedMemBuffer {
  public:
    SharedMemBuffer();
    ~SharedMemBuffer();

    void alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests);
    void dealloc(void* object);

  private:
    void* buffer_;
    uint64_t size_;
    std::map<void*, std::vector<std::vector<std::pair<void**, uint64_t>>>> hist_requests_;

    void arrange(std::vector<std::pair<void**, uint64_t>> requests);
};

static SharedMemBuffer shared_mem_buffer;

#endif
\ No newline at end of file
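For orientation, here is a minimal sketch of the intended usage pattern of this class, modeled on how the AMX_MOE constructor and destructor later in this commit call it. MyOp, rows and cols are illustrative placeholders, not part of the commit.

// Hypothetical operator that borrows two float scratch areas from the shared buffer.
// alloc() records this object's requests and fills each void** with a pointer into a
// single 64-byte-aligned allocation sized for the largest requester seen so far.
#include <utility>
#include <vector>
#include "shared_mem_buffer.h"

class MyOp {
  public:
    MyOp(uint64_t rows, uint64_t cols) {
        std::vector<std::pair<void**, uint64_t>> requests;
        requests.push_back({(void**)&input_, sizeof(float) * rows * cols});
        requests.push_back({(void**)&output_, sizeof(float) * rows * cols});
        shared_mem_buffer.alloc(this, requests);  // input_/output_ are valid after this call
    }
    ~MyOp() { shared_mem_buffer.dealloc(this); }  // drop this object's registration

  private:
    float* input_;
    float* output_;
};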
csrc/ktransformers_ext/ext_bindings.cpp

@@ -17,6 +17,7 @@
 #include "operators/llamafile/linear.h"
 #include "operators/llamafile/mlp.h"
 #include "operators/llamafile/moe.h"
+#include "operators/amx/moe.hpp"
 #include "pybind11/functional.h"
 #include "pybind11/operators.h"
 #include "pybind11/pybind11.h"

@@ -563,6 +564,75 @@ class MOEBindings {
     };
 };
+
+template <class T>
+class AMX_MOEBindings {
+  public:
+    class WarmUpBindings {
+      public:
+        struct Args {
+            CPUInfer* cpuinfer;
+            AMX_MOE<T>* moe;
+        };
+        static void inner(void* args) {
+            Args* args_ = (Args*)args;
+            args_->cpuinfer->enqueue(&AMX_MOE<T>::warm_up, args_->moe);
+        }
+        static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T>& moe) {
+            Args* args = new Args{nullptr, &moe};
+            return std::make_pair((intptr_t)&inner, (intptr_t)args);
+        }
+    };
+    class LoadWeightsBindings {
+      public:
+        struct Args {
+            CPUInfer* cpuinfer;
+            AMX_MOE<T>* moe;
+        };
+        static void inner(void* args) {
+            Args* args_ = (Args*)args;
+            args_->cpuinfer->enqueue(&AMX_MOE<T>::load_weights, args_->moe);
+        }
+        static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T>& moe) {
+            Args* args = new Args{nullptr, &moe};
+            return std::make_pair((intptr_t)&inner, (intptr_t)args);
+        }
+    };
+    class ForwardBindings {
+      public:
+        struct Args {
+            CPUInfer* cpuinfer;
+            AMX_MOE<T>* moe;
+            int qlen;
+            int k;
+            const uint64_t* expert_ids;
+            const float* weights;
+            const void* input;
+            void* output;
+            int* batch_size_tensor;
+        };
+        static void inner(void* args) {
+            Args* args_ = (Args*)args;
+            args_->cpuinfer->enqueue(&AMX_MOE<T>::forward, args_->moe, args_->qlen, args_->k, args_->expert_ids,
+                                     args_->weights, args_->input, args_->output, args_->batch_size_tensor);
+        }
+        static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T>& moe, int qlen, int k, intptr_t expert_ids,
+                                                                intptr_t weights, intptr_t input, intptr_t output,
+                                                                intptr_t batch_size_tensor) {
+            Args* args = new Args{nullptr,
+                                  &moe,
+                                  qlen,
+                                  k,
+                                  (const uint64_t*)expert_ids,
+                                  (const float*)weights,
+                                  (const void*)input,
+                                  (void*)output,
+                                  (int*)batch_size_tensor};
+            return std::make_pair((intptr_t)&inner, (intptr_t)args);
+        }
+    };
+};
 
 PYBIND11_MODULE(cpuinfer_ext, m) {
     py::class_<CPUInfer>(m, "CPUInfer")
         .def(py::init<int>())

@@ -621,6 +691,27 @@ PYBIND11_MODULE(cpuinfer_ext, m) {
         .def("warm_up", &MOEBindings::WarmUpBindinds::cpuinfer_interface)
         .def("forward", &MOEBindings::ForwardBindings::cpuinfer_interface);
+    py::class_<AMX_MOEConfig>(moe_module, "AMX_MOEConfig")
+        .def(py::init([](int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int max_len,
+                         intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj) {
+            return AMX_MOEConfig(expert_num, routed_expert_num, hidden_size, intermediate_size, max_len,
+                                 (void*)gate_proj, (void*)up_proj, (void*)down_proj);
+        }));
+    py::class_<AMX_MOE<amx::GemmKernel224BF>>(moe_module, "AMXBF16_MOE")
+        .def(py::init<AMX_MOEConfig>())
+        .def("warm_up", &AMX_MOEBindings<amx::GemmKernel224BF>::WarmUpBindings::cpuinfer_interface)
+        .def("load_weights", &AMX_MOEBindings<amx::GemmKernel224BF>::LoadWeightsBindings::cpuinfer_interface)
+        .def("forward", &AMX_MOEBindings<amx::GemmKernel224BF>::ForwardBindings::cpuinfer_interface);
+    py::class_<AMX_MOE<amx::GemmKernel224Int8>>(moe_module, "AMXInt8_MOE")
+        .def(py::init<AMX_MOEConfig>())
+        .def("warm_up", &AMX_MOEBindings<amx::GemmKernel224Int8>::WarmUpBindings::cpuinfer_interface)
+        .def("load_weights", &AMX_MOEBindings<amx::GemmKernel224Int8>::LoadWeightsBindings::cpuinfer_interface)
+        .def("forward", &AMX_MOEBindings<amx::GemmKernel224Int8>::ForwardBindings::cpuinfer_interface);
     auto kvcache_module = m.def_submodule("kvcache");
     py::enum_<AnchorType>(kvcache_module, "AnchorType")
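All three binding helpers above return the same kind of task handle: a std::pair of raw pointers holding the address of a C-style trampoline (inner) and a heap-allocated Args struct whose leading CPUInfer* field is left as nullptr. The sketch below illustrates how such a handle could be consumed on the C++ side; the dispatcher name and the ArgsHeader cast are assumptions for illustration only — the real consumption happens inside the existing CPUInfer submit/sync machinery, which this diff does not touch.

#include <cstdint>
#include <utility>

class CPUInfer;  // forward declaration; the real class lives in the cpu_backend sources

// Every Args struct defined above begins with a CPUInfer* member, so a dispatcher
// can patch in the owning instance before invoking the trampoline.
struct ArgsHeader { CPUInfer* cpuinfer; };

using TaskHandle = std::pair<intptr_t, intptr_t>;  // (inner function, Args*)

inline void run_task(CPUInfer* owner, const TaskHandle& task) {
    reinterpret_cast<ArgsHeader*>(task.second)->cpuinfer = owner;  // replace the nullptr
    auto fn = reinterpret_cast<void (*)(void*)>(task.first);
    fn(reinterpret_cast<void*>(task.second));  // e.g. calls ForwardBindings::inner
}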
csrc/ktransformers_ext/operators/amx/la/amx.hpp (new file, mode 100644, +974 lines)

(The diff for this file is collapsed in this view and is not reproduced here.)
csrc/ktransformers_ext/operators/amx/la/utils.hpp (new file, mode 100644)

/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2025-04-25 18:28:12
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2025-04-25 18:28:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#pragma once

#include <cstdint>

template <typename T>
T* offset_pointer(T* ptr, std::size_t byte_offset) {
    return reinterpret_cast<T*>(reinterpret_cast<char*>(ptr) + byte_offset);
}

template <typename T>
const T* offset_pointer(const T* ptr, std::size_t byte_offset) {
    return reinterpret_cast<const T*>(reinterpret_cast<const char*>(ptr) + byte_offset);
}

template <typename T>
T* offset_pointer_row_major(T* t, int row, int col, std::size_t ld) {
    return offset_pointer(t, row * ld) + col;
}

template <typename T>
T* offset_pointer_col_major(T* t, int row, int col, std::size_t ld) {
    return offset_pointer(t, col * ld) + row;
}

static inline void avx512_copy_32xbf16(__m512i* src, __m512i* dst) {
    _mm512_storeu_si512(dst, _mm512_loadu_si512(src));
}

static inline void avx512_32xfp32_to_32xbf16(__m512* src0, __m512* src1, __m512i* dst) {
    _mm512_storeu_si512(dst, __m512i(_mm512_cvtne2ps_pbh(*src1, *src0)));
}

static inline void avx512_32xbf16_to_32xfp32(__m512i* src, __m512* dst0, __m512* dst1) {
    _mm512_storeu_ps(dst0, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)(src))), 16)));
    _mm512_storeu_ps(dst1, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)(src) + 1)), 16)));
}
\ No newline at end of file
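As a plain-scalar reference for the two bf16 conversions above (not part of the commit): bf16 is simply the upper 16 bits of an IEEE-754 float, so widening is a 16-bit left shift and a truncating narrowing is a 16-bit right shift. Note that the vector narrowing above uses _mm512_cvtne2ps_pbh, which rounds to nearest-even rather than truncating.

#include <cstdint>
#include <cstring>

// Widen one bf16 value (stored as uint16_t) to float: move it into the high half of a
// 32-bit pattern, mirroring _mm512_cvtepu16_epi32 followed by _mm512_slli_epi32(..., 16).
inline float bf16_to_fp32(uint16_t b) {
    uint32_t bits = static_cast<uint32_t>(b) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Truncating fp32 -> bf16 (drops the low 16 mantissa bits without rounding).
inline uint16_t fp32_to_bf16_truncate(float f) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return static_cast<uint16_t>(bits >> 16);
}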
csrc/ktransformers_ext/operators/amx/moe.hpp (new file, mode 100644)

/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2025-04-25 18:28:12
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2025-04-25 18:28:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_AMX_MOE_H
#define CPUINFER_OPERATOR_AMX_MOE_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
#include "la/amx.hpp"

#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>

void* numa_alloc_aligned(size_t size, int node, size_t alignment) {
    void* ptr = numa_alloc_onnode(size, node);
    assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
    return ptr;
}
#endif

static inline __m512 exp_avx512(__m512 x) {
    const __m512 log2e = _mm512_set1_ps(1.44269504089f);
    const __m512 c1 = _mm512_set1_ps(0.69314718056f);

    __m512 y = _mm512_mul_ps(x, log2e);
    __m512i int_part = _mm512_cvtps_epi32(y);
    __m512 frac_part = _mm512_sub_ps(y, _mm512_cvtepi32_ps(int_part));

    const __m512 poly_1 = _mm512_set1_ps(0.9999999995f);
    const __m512 poly_2 = _mm512_set1_ps(0.6931471805f);
    const __m512 poly_3 = _mm512_set1_ps(0.2402265069f);
    const __m512 poly_4 = _mm512_set1_ps(0.0555041087f);
    const __m512 poly_5 = _mm512_set1_ps(0.0096181291f);
    const __m512 poly_6 = _mm512_set1_ps(0.0013333558f);

    __m512 frac_exp = _mm512_fmadd_ps(
        frac_part, poly_6,
        _mm512_fmadd_ps(frac_part, poly_5,
                        _mm512_fmadd_ps(frac_part, poly_4,
                                        _mm512_fmadd_ps(frac_part, poly_3,
                                                        _mm512_fmadd_ps(frac_part, poly_2, poly_1)))));

    __m512 two_pow_i = _mm512_scalef_ps(_mm512_set1_ps(1.0f), _mm512_cvtepi32_ps(int_part));
    return _mm512_mul_ps(two_pow_i, frac_exp);
}

static inline __m512 act_fn(__m512 gate_val, __m512 up_val) {
    __m512 neg_gate_val = _mm512_sub_ps(_mm512_setzero_ps(), gate_val);
    __m512 exp_neg_gate = exp_avx512(neg_gate_val);
    __m512 denom = _mm512_add_ps(_mm512_set1_ps(1.0f), exp_neg_gate);
    __m512 act_val = _mm512_div_ps(gate_val, denom);
    return _mm512_mul_ps(act_val, up_val);
}

struct AMX_MOEConfig {
    int expert_num;
    int routed_expert_num;
    int hidden_size;
    int intermediate_size;
    int max_len;
    void* gate_proj;
    void* up_proj;
    void* down_proj;

    AMX_MOEConfig() {}

    AMX_MOEConfig(int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int max_len,
                  void* gate_proj, void* up_proj, void* down_proj)
        : expert_num(expert_num),
          routed_expert_num(routed_expert_num),
          hidden_size(hidden_size),
          intermediate_size(intermediate_size),
          max_len(max_len),
          gate_proj(gate_proj),
          up_proj(up_proj),
          down_proj(down_proj) {}
};

template <class T>
class AMX_MOE {
  private:
    AMX_MOEConfig config_;
    void* gate_proj_;  // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    void* up_proj_;    // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
    void* down_proj_;  // [expert_num * hidden_size * intermediate_size ( /32 if quantized)]

    ggml_bf16_t* m_local_input_;        // [routed_expert_num * max_len * hidden_size]
    ggml_bf16_t* m_local_gate_output_;  // [routed_expert_num * max_len * intermediate_size]
    ggml_bf16_t* m_local_up_output_;    // [routed_expert_num * max_len * intermediate_size]
    ggml_bf16_t* m_local_down_output_;  // [routed_expert_num * max_len * hidden_size]

    std::vector<std::vector<int>> m_local_pos_;          // [max_len, routed_expert_num]
    std::vector<int> m_local_num_;                       // [expert_num]
    std::vector<int> m_expert_id_map_;                   // [expert_num]
    std::vector<ggml_bf16_t*> m_local_input_ptr_;        // [expert_num]
    std::vector<ggml_bf16_t*> m_local_gate_output_ptr_;  // [expert_num]
    std::vector<ggml_bf16_t*> m_local_up_output_ptr_;    // [expert_num]
    std::vector<ggml_bf16_t*> m_local_down_output_ptr_;  // [expert_num]

    std::vector<std::shared_ptr<typename T::BufferA>> gate_up_ba_;
    std::vector<std::shared_ptr<typename T::BufferC>> gate_bc_;
    std::vector<std::shared_ptr<typename T::BufferC>> up_bc_;
    std::vector<std::shared_ptr<typename T::BufferA>> down_ba_;
    std::vector<std::shared_ptr<typename T::BufferC>> down_bc_;

#ifdef USE_NUMA
    std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> gate_bb_numa_;
    std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> up_bb_numa_;
    std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> down_bb_numa_;
#else
    std::vector<std::shared_ptr<typename T::BufferB>> gate_bb_;
    std::vector<std::shared_ptr<typename T::BufferB>> up_bb_;
    std::vector<std::shared_ptr<typename T::BufferB>> down_bb_;
#endif

  public:
    AMX_MOE(AMX_MOEConfig config) {
        config_ = config;
        gate_proj_ = config_.gate_proj;
        up_proj_ = config_.up_proj;
        down_proj_ = config_.down_proj;

        std::vector<std::pair<void**, uint64_t>> m_mem_requests;
        m_mem_requests.push_back({(void**)&m_local_input_,
                                  sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
        m_mem_requests.push_back({(void**)&m_local_gate_output_,
                                  sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.intermediate_size});
        m_mem_requests.push_back({(void**)&m_local_up_output_,
                                  sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.intermediate_size});
        m_mem_requests.push_back({(void**)&m_local_down_output_,
                                  sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
        std::vector<void*> gate_up_ba_ptr(config_.expert_num);
        std::vector<void*> gate_bc_ptr(config_.expert_num);
        std::vector<void*> up_bc_ptr(config_.expert_num);
        std::vector<void*> down_ba_ptr(config_.expert_num);
        std::vector<void*> down_bc_ptr(config_.expert_num);
        for (int i = 0; i < config_.expert_num; i++) {
            m_mem_requests.push_back({(void**)&gate_up_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.hidden_size)});
            m_mem_requests.push_back({(void**)&gate_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
            m_mem_requests.push_back({(void**)&up_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
            m_mem_requests.push_back({(void**)&down_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.intermediate_size)});
            m_mem_requests.push_back({(void**)&down_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.hidden_size)});
        }
        shared_mem_buffer.alloc(this, m_mem_requests);

        m_local_pos_.resize(config_.max_len);
        for (int i = 0; i < config_.max_len; i++) {
            m_local_pos_[i].resize(config_.routed_expert_num);
        }
        m_expert_id_map_.resize(config_.expert_num);
        m_local_num_.resize(config_.expert_num);
        m_local_input_ptr_.resize(config_.expert_num);
        m_local_gate_output_ptr_.resize(config_.expert_num);
        m_local_up_output_ptr_.resize(config_.expert_num);
        m_local_down_output_ptr_.resize(config_.expert_num);

        for (uint64_t i = 0; i < config_.expert_num; i++) {
            gate_up_ba_.push_back(std::make_shared<typename T::BufferA>(config_.max_len, config_.hidden_size, gate_up_ba_ptr[i]));
            gate_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, gate_bc_ptr[i]));
            up_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, up_bc_ptr[i]));
            down_ba_.push_back(std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, down_ba_ptr[i]));
            down_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, down_bc_ptr[i]));

#ifdef USE_NUMA
            int numa_nodes = numa_num_configured_nodes();
            gate_bb_numa_.resize(numa_nodes);
            up_bb_numa_.resize(numa_nodes);
            down_bb_numa_.resize(numa_nodes);
            for (int j = 0; j < numa_nodes; j++) {
                void* gate_bb_ptr = numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
                gate_bb_numa_[j].push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));
                void* up_bb_ptr = numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
                up_bb_numa_[j].push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));
                void* down_bb_ptr = numa_alloc_aligned(T::BufferB::required_size(config_.hidden_size, config_.intermediate_size), j, 64);
                down_bb_numa_[j].push_back(std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
            }
#else
            void* gate_bb_ptr = std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
            gate_bb_.push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));
            void* up_bb_ptr = std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
            up_bb_.push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));
            void* down_bb_ptr = std::aligned_alloc(64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size));
            down_bb_.push_back(std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
#endif
        }
    }

    ~AMX_MOE() { shared_mem_buffer.dealloc(this); }

    void load_weights(Backend* backend) {
        int nth = T::recommended_nth(config_.intermediate_size);
        backend->do_work_stealing_job(
            nth * config_.expert_num, nullptr,
            [&](int task_id) {
                uint64_t expert_idx = task_id / nth;
                int ith = task_id % nth;
#ifdef USE_NUMA
                int numa_nodes = numa_num_configured_nodes();
                for (int j = 0; j < numa_nodes; j++) {
                    gate_bb_numa_[j][expert_idx]->from_mat(
                        (ggml_bf16_t*)config_.gate_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
                    up_bb_numa_[j][expert_idx]->from_mat(
                        (ggml_bf16_t*)config_.up_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
                }
#else
                gate_bb_[expert_idx]->from_mat(
                    (ggml_bf16_t*)config_.gate_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
                up_bb_[expert_idx]->from_mat(
                    (ggml_bf16_t*)config_.up_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
#endif
            },
            nullptr);
        nth = T::recommended_nth(config_.hidden_size);
        backend->do_work_stealing_job(
            nth * config_.expert_num, nullptr,
            [&](int task_id) {
                uint64_t expert_idx = task_id / nth;
                int ith = task_id % nth;
#ifdef USE_NUMA
                int numa_nodes = numa_num_configured_nodes();
                for (int j = 0; j < numa_nodes; j++) {
                    down_bb_numa_[j][expert_idx]->from_mat(
                        (ggml_bf16_t*)config_.down_proj + expert_idx * config_.hidden_size * config_.intermediate_size, ith, nth);
                }
#else
                down_bb_[expert_idx]->from_mat(
                    (ggml_bf16_t*)config_.down_proj + expert_idx * config_.hidden_size * config_.intermediate_size, ith, nth);
#endif
            },
            nullptr);
    }

    void warm_up(Backend* backend) {}

    void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output,
                 int* batch_size_tensor, Backend* backend) {
        bool use_amx = (qlen > 4 * config_.expert_num / config_.routed_expert_num);
        qlen = batch_size_tensor[0];
        int activated_expert = 0;
        for (int i = 0; i < config_.expert_num; i++) {
            m_local_num_[i] = 0;
        }
        for (int i = 0; i < qlen; i++) {
            for (int j = 0; j < k; j++) {
                m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
            }
        }
        for (int i = 0; i < config_.expert_num; i++) {
            if (m_local_num_[i] > 0) {
                m_expert_id_map_[activated_expert] = i;
                activated_expert++;
            }
        }
        uint64_t offset = 0;
        for (int i = 0; i < config_.expert_num; i++) {
            m_local_input_ptr_[i] = m_local_input_ + offset * config_.hidden_size;
            m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
            m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
            m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
            offset += m_local_num_[i];
        }
        backend->do_work_stealing_job(
            qlen, nullptr,
            [&](int i) {
                for (int j = 0; j < k; j++) {
                    memcpy(m_local_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size,
                           (ggml_bf16_t*)input + i * config_.hidden_size, sizeof(ggml_bf16_t) * config_.hidden_size);
                }
            },
            nullptr);
        backend->do_work_stealing_job(
            activated_expert, nullptr,
            [&](int task_id) {
                int expert_idx = m_expert_id_map_[task_id];
                gate_up_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_input_ptr_[expert_idx], 0, 1);
            },
            nullptr);
        int nth = T::recommended_nth(config_.intermediate_size);
        backend->do_work_stealing_job(
            nth * activated_expert, [&](int _) { T::config(); },
            [&](int task_id) {
                int expert_idx = m_expert_id_map_[task_id / nth];
                int ith = task_id % nth;
#ifdef USE_NUMA
                amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                             gate_up_ba_[expert_idx], gate_bb_numa_[Backend::numa_node][expert_idx],
                             gate_bc_[expert_idx], ith, nth, use_amx);
                amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                             gate_up_ba_[expert_idx], up_bb_numa_[Backend::numa_node][expert_idx],
                             up_bc_[expert_idx], ith, nth, use_amx);
#else
                amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                             gate_up_ba_[expert_idx], gate_bb_[expert_idx], gate_bc_[expert_idx], ith, nth, use_amx);
                amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                             gate_up_ba_[expert_idx], up_bb_[expert_idx], up_bc_[expert_idx], ith, nth, use_amx);
#endif
                gate_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], ith, nth);
                up_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_up_output_ptr_[expert_idx], ith, nth);
                auto [n_start, n_end] = T::split_range_n(config_.intermediate_size, ith, nth);
                for (int i = 0; i < m_local_num_[expert_idx]; i++) {
                    ggml_bf16_t* gate_output_ptr = &m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size];
                    ggml_bf16_t* up_output_ptr = &m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size];
                    for (int j = n_start; j < n_end; j += 32) {
                        __m512 gate_val0, gate_val1, up_val0, up_val1;
                        avx512_32xbf16_to_32xfp32((__m512i*)(gate_output_ptr + j), &gate_val0, &gate_val1);
                        avx512_32xbf16_to_32xfp32((__m512i*)(up_output_ptr + j), &up_val0, &up_val1);
                        __m512 result0 = act_fn(gate_val0, up_val0);
                        __m512 result1 = act_fn(gate_val1, up_val1);
                        avx512_32xfp32_to_32xbf16(&result0, &result1, (__m512i*)(gate_output_ptr + j));
                    }
                }
            },
            nullptr);
        backend->do_work_stealing_job(
            activated_expert, nullptr,
            [&](int task_id) {
                int expert_idx = m_expert_id_map_[task_id];
                down_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], 0, 1);
            },
            nullptr);
        nth = T::recommended_nth(config_.hidden_size);
        backend->do_work_stealing_job(
            nth * activated_expert, [&](int _) { T::config(); },
            [&](int task_id) {
                int expert_idx = m_expert_id_map_[task_id / nth];
                int ith = task_id % nth;
#ifdef USE_NUMA
                amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size,
                             down_ba_[expert_idx], down_bb_numa_[Backend::numa_node][expert_idx],
                             down_bc_[expert_idx], ith, nth, use_amx);
#else
                amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size,
                             down_ba_[expert_idx], down_bb_[expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#endif
                down_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_down_output_ptr_[expert_idx], ith, nth);
            },
            nullptr);
        backend->do_work_stealing_job(
            qlen, nullptr,
            [&](int i) {
                for (int e = 0; e < config_.hidden_size; e += 32) {
                    __m512 x0 = _mm512_setzero_ps();
                    __m512 x1 = _mm512_setzero_ps();
                    for (int j = 0; j < k; j++) {
                        __m512 weight = _mm512_set1_ps(weights[i * k + j]);
                        __m512 down_output0, down_output1;
                        avx512_32xbf16_to_32xfp32(
                            (__m512i*)(m_local_down_output_ptr_[expert_ids[i * k + j]] +
                                       m_local_pos_[i][j] * config_.hidden_size + e),
                            &down_output0, &down_output1);
                        x0 = _mm512_fmadd_ps(down_output0, weight, x0);
                        x1 = _mm512_fmadd_ps(down_output1, weight, x1);
                    }
                    avx512_32xfp32_to_32xbf16(&x0, &x1, (__m512i*)((ggml_bf16_t*)output + i * config_.hidden_size + e));
                }
            },
            nullptr);
    }
};

#endif
\ No newline at end of file
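For reference, act_fn above is the usual SiLU-gated product used in DeepSeek-style MoE MLPs: silu(gate) * up, with silu(x) = x / (1 + e^(-x)); exp_avx512 supplies the e^(-x) term through a range-reduced polynomial. A scalar equivalent, not part of the commit:

#include <cmath>

// Scalar reference for act_fn: SiLU(gate_val) multiplied by up_val.
// The AVX-512 version performs the same computation per 32-bit lane.
inline float act_fn_ref(float gate_val, float up_val) {
    float silu = gate_val / (1.0f + std::exp(-gate_val));
    return silu * up_val;
}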
csrc/ktransformers_ext/operators/llamafile/linear.h

@@ -17,12 +17,12 @@
 #include <vector>
 
 #include "../../cpu_backend/backend.h"
+#include "../../cpu_backend/shared_mem_buffer.h"
 #include "conversion.h"
 #include "llama.cpp/ggml-impl.h"
 #include "llama.cpp/ggml-quants.h"
 #include "llama.cpp/ggml.h"
 #include "llamafile/sgemm.h"
-#include "shared_mem_buffer.h"
 
 struct LinearConfig {
     int input_size;
csrc/ktransformers_ext/operators/llamafile/mlp.h

@@ -17,12 +17,12 @@
 #include <vector>
 
 #include "../../cpu_backend/backend.h"
+#include "../../cpu_backend/shared_mem_buffer.h"
 #include "conversion.h"
 #include "llama.cpp/ggml-impl.h"
 #include "llama.cpp/ggml-quants.h"
 #include "llama.cpp/ggml.h"
 #include "llamafile/sgemm.h"
-#include "shared_mem_buffer.h"
 
 struct MLPConfig {
     int hidden_size;
csrc/ktransformers_ext/operators/llamafile/moe.h

@@ -17,12 +17,12 @@
 #include <vector>
 
 #include "../../cpu_backend/backend.h"
+#include "../../cpu_backend/shared_mem_buffer.h"
 #include "conversion.h"
 #include "llama.cpp/ggml-impl.h"
 #include "llama.cpp/ggml-quants.h"
 #include "llama.cpp/ggml.h"
 #include "llamafile/sgemm.h"
-#include "shared_mem_buffer.h"
 
 struct MOEConfig {
     int expert_num;
csrc/ktransformers_ext/operators/llamafile/shared_mem_buffer.h (deleted, mode 100644 → 0)

/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-08-05 04:49:08
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-08-05 06:36:41
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_SHAREDMEMBUFFER_H
#define CPUINFER_SHAREDMEMBUFFER_H

#include <cstdint>
#include <cstdlib>
#include <map>
#include <vector>

class SharedMemBuffer {
  public:
    SharedMemBuffer();
    ~SharedMemBuffer();

    void alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests);
    void dealloc(void* object);

  private:
    void* buffer_;
    uint64_t size_;
    std::map<void*, std::vector<std::vector<std::pair<void**, uint64_t>>>> hist_requests_;

    void arrange(std::vector<std::pair<void**, uint64_t>> requests);
};

static SharedMemBuffer shared_mem_buffer;

#endif
\ No newline at end of file
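Taken together with the include changes in linear.h, mlp.h and moe.h above, this deletion is the other half of a file move rather than a removal of functionality: the same 37-line header now lives at csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.h, where both the existing llamafile operators and the new AMX operator reach the static SharedMemBuffer instance through a single include path.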
ktransformers/operators/experts.py

@@ -25,8 +25,9 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext
 sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
 import cpuinfer_ext
 from cpuinfer_ext.moe import MOEConfig, MOE
+from cpuinfer_ext.moe import AMX_MOEConfig, AMXBF16_MOE, AMXInt8_MOE
 import ctypes
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_gguf import GGMLQuantizationType, GGUFLoader
 from ktransformers.util.utils import InferenceState
 from ktransformers.server.config.config import Config
 from transformers.activations import ACT2FN

@@ -141,6 +142,7 @@ class KExpertsCPU(KExpertsBase):
         assert device.lower() == "cpu", "KExpertsCPU can only be loaded on CPU"
         self.n_routed_experts = n_routed_experts
         self.out_device = out_device
+        self.backend = kwargs.get("backend", "llamafile")
 
     def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None, warmup: bool = False):
         if device:

@@ -163,27 +165,62 @@ class KExpertsCPU(KExpertsBase):
         )
         # print(self.gate_qtype, self.up_qtype, self.down_qtype)
         n_routed_experts = self.n_routed_experts
+        self.cpu_infer = KExpertsCPU.CPU_INFER
         # n_routed_experts = len(self.orig_module)
-        moe_config = MOEConfig(
-            n_routed_experts,
-            self.config.num_experts_per_tok,
-            self.config.hidden_size,
-            self.config.moe_intermediate_size,
-            64,
-            10,
-            1024,
-            gate_ptr,
-            up_ptr,
-            down_ptr,
-            self.gate_type,
-            self.up_type,
-            self.down_type,
-            30, # TODO: get from model.dtype
-        )
+        if self.backend == "llamafile":
+            moe_config = MOEConfig(
+                n_routed_experts,
+                self.config.num_experts_per_tok,
+                self.config.hidden_size,
+                self.config.moe_intermediate_size,
+                64,
+                10,
+                1024,
+                gate_ptr,
+                up_ptr,
+                down_ptr,
+                self.gate_type,
+                self.up_type,
+                self.down_type,
+                30, # TODO: get from model.dtype
+            )
+            self.moe = MOE(moe_config)
+        elif self.backend == "AMXBF16":
+            assert self.gate_type == GGMLQuantizationType.BF16
+            assert self.up_type == GGMLQuantizationType.BF16
+            assert self.down_type == GGMLQuantizationType.BF16
+            moe_config = AMX_MOEConfig(
+                n_routed_experts,
+                self.config.num_experts_per_tok,
+                self.config.hidden_size,
+                self.config.moe_intermediate_size,
+                25600,
+                gate_ptr,
+                up_ptr,
+                down_ptr,
+            )
+            self.moe = AMXBF16_MOE(moe_config)
+            self.cpu_infer.submit(self.moe.load_weights())
+            self.cpu_infer.sync()
+        elif self.backend == "AMXInt8":
+            assert self.gate_type == GGMLQuantizationType.BF16
+            assert self.up_type == GGMLQuantizationType.BF16
+            assert self.down_type == GGMLQuantizationType.BF16
+            moe_config = AMX_MOEConfig(
+                n_routed_experts,
+                self.config.num_experts_per_tok,
+                self.config.hidden_size,
+                self.config.moe_intermediate_size,
+                25600,
+                gate_ptr,
+                up_ptr,
+                down_ptr,
+            )
+            self.moe = AMXInt8_MOE(moe_config)
+            self.cpu_infer.submit(self.moe.load_weights())
+            self.cpu_infer.sync()
         # print(n_routed_experts, hidden_size, moe_intermediate_size)
         num_experts_per_tok = self.config.num_experts_per_tok
-        self.moe = MOE(moe_config)
-        self.cpu_infer = KExpertsCPU.CPU_INFER
         if warmup:
             self.cpu_infer.submit(self.moe.warm_up())
             self.cpu_infer.sync()
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-amx.yaml (new file, mode 100644)

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False  # change this to True to enable long context(prefill may slower).
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
\ No newline at end of file