OpenDAS / ktransformers

Commit f3d842a0, authored Apr 25, 2025 by chenht2022
Parent: b90362b5

support AMX
Showing 15 changed files with 1799 additions and 62 deletions
.clang-format                                                     +5   -0
csrc/ktransformers_ext/CMakeLists.txt                             +2   -1
csrc/ktransformers_ext/bench/bench_moe_amx.py                     +107 -0
csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.cpp          +3   -2
csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.h            +37  -0
csrc/ktransformers_ext/ext_bindings.cpp                           +91  -0
csrc/ktransformers_ext/operators/amx/la/amx.hpp                   +974 -0
csrc/ktransformers_ext/operators/amx/la/utils.hpp                 +46  -0
csrc/ktransformers_ext/operators/amx/moe.hpp                      +398 -0
csrc/ktransformers_ext/operators/llamafile/linear.h               +1   -1
csrc/ktransformers_ext/operators/llamafile/mlp.h                  +1   -1
csrc/ktransformers_ext/operators/llamafile/moe.h                  +1   -1
csrc/ktransformers_ext/operators/llamafile/shared_mem_buffer.h    +0   -37
ktransformers/operators/experts.py                                +56  -19
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-amx.yaml   +77  -0
.clang-format (new file, mode 100644)

---
BasedOnStyle: LLVM
ColumnLimit: 120  # set the maximum line width to 120
IndentWidth: 2
---

csrc/ktransformers_ext/CMakeLists.txt

@@ -293,9 +293,10 @@ aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/llamafile SOURCE_DIR3)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llamafile SOURCE_DIR4)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/kvcache SOURCE_DIR5)
+aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/amx SOURCE_DIR6)
-set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4} ${SOURCE_DIR5})
+set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4} ${SOURCE_DIR5} ${SOURCE_DIR6})
 file(GLOB_RECURSE FMT_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.h")

csrc/ktransformers_ext/bench/bench_moe_amx.py (new file, mode 100644)

#!/usr/bin/env python
# coding=utf-8
'''
Description  :
Author       : chenht2022
Date         : 2025-04-25 18:28:12
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2025-04-25 18:28:12
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

expert_num = 8
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
n_routed_experts = 8
layer_num = 10
qlen = 1024
CPUInfer = cpuinfer_ext.CPUInfer(65)
warm_up_iter = 100
test_iter = 100

def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "bf16":
            bytes_per_elem = 2.000000
        elif quant_mode == "int8":
            bytes_per_elem = 1.000000
        else:
            assert(False)

        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            config = cpuinfer_ext.moe.AMX_MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, max_len,
                                                    gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr())
            if quant_mode == "bf16":
                moe = cpuinfer_ext.moe.AMXBF16_MOE(config)
                CPUInfer.submit(moe.load_weights())
                CPUInfer.sync()
            elif quant_mode == "int8":
                moe = cpuinfer_ext.moe.AMXInt8_MOE(config)
                CPUInfer.submit(moe.load_weights())
                CPUInfer.sync()
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)
        expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device="cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
        weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device="cuda").to("cpu").contiguous()
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        qlen_tensor = torch.tensor([qlen], dtype=torch.int32)

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward(
                    qlen,
                    n_routed_experts,
                    expert_ids[i % layer_num].data_ptr(),
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(),
                    output[i % layer_num].data_ptr(),
                    qlen_tensor.data_ptr()
                )
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward(
                    qlen,
                    n_routed_experts,
                    expert_ids[i % layer_num].data_ptr(),
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(),
                    output[i % layer_num].data_ptr(),
                    qlen_tensor.data_ptr()
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter)
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('Flops: ', hidden_size * intermediate_size * qlen * 3 * n_routed_experts * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GFLOPS')
        print('')

bench_moe("bf16")
bench_moe("int8")
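
Usage note: the script resolves cpuinfer_ext from csrc/ktransformers_ext/build (it appends ../build relative to its own location to sys.path), so assuming the extension has been built there it can be run directly as python csrc/ktransformers_ext/bench/bench_moe_amx.py. It allocates its random test tensors on a CUDA device before moving them to the CPU, then reports time per iteration, effective weight bandwidth in GB/s, and GFLOPS for the bf16 and then the int8 AMX MoE kernel.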

csrc/ktransformers_ext/operators/llamafile/shared_mem_buffer.cpp → csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.cpp (renamed)

@@ -30,7 +30,8 @@ void SharedMemBuffer::alloc(void* object, std::vector<std::pair<void**, uint64_t
     if (buffer_) {
         free(buffer_);
     }
-    buffer_ = malloc(size);
+    buffer_ = std::aligned_alloc(64, size);
     size_ = size;
     for (auto& obj_requests : hist_requests_) {
         for (auto& requests : obj_requests.second) {

csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.h (new file, mode 100644)

/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-08-05 04:49:08
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-08-05 06:36:41
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_SHAREDMEMBUFFER_H
#define CPUINFER_SHAREDMEMBUFFER_H

#include <cstdint>
#include <cstdlib>
#include <map>
#include <vector>

class SharedMemBuffer {
 public:
  SharedMemBuffer();
  ~SharedMemBuffer();

  void alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests);
  void dealloc(void* object);

 private:
  void* buffer_;
  uint64_t size_;
  std::map<void*, std::vector<std::vector<std::pair<void**, uint64_t>>>> hist_requests_;

  void arrange(std::vector<std::pair<void**, uint64_t>> requests);
};

static SharedMemBuffer shared_mem_buffer;

#endif
\ No newline at end of file

csrc/ktransformers_ext/ext_bindings.cpp

@@ -17,6 +17,7 @@
 #include "operators/llamafile/linear.h"
 #include "operators/llamafile/mlp.h"
 #include "operators/llamafile/moe.h"
+#include "operators/amx/moe.hpp"
 #include "pybind11/functional.h"
 #include "pybind11/operators.h"
 #include "pybind11/pybind11.h"

@@ -563,6 +564,75 @@ class MOEBindings {
   };
 };
+
+template <class T>
+class AMX_MOEBindings {
+ public:
+  class WarmUpBindings {
+   public:
+    struct Args {
+      CPUInfer* cpuinfer;
+      AMX_MOE<T>* moe;
+    };
+    static void inner(void* args) {
+      Args* args_ = (Args*)args;
+      args_->cpuinfer->enqueue(&AMX_MOE<T>::warm_up, args_->moe);
+    }
+    static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T>& moe) {
+      Args* args = new Args{nullptr, &moe};
+      return std::make_pair((intptr_t)&inner, (intptr_t)args);
+    }
+  };
+  class LoadWeightsBindings {
+   public:
+    struct Args {
+      CPUInfer* cpuinfer;
+      AMX_MOE<T>* moe;
+    };
+    static void inner(void* args) {
+      Args* args_ = (Args*)args;
+      args_->cpuinfer->enqueue(&AMX_MOE<T>::load_weights, args_->moe);
+    }
+    static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T>& moe) {
+      Args* args = new Args{nullptr, &moe};
+      return std::make_pair((intptr_t)&inner, (intptr_t)args);
+    }
+  };
+  class ForwardBindings {
+   public:
+    struct Args {
+      CPUInfer* cpuinfer;
+      AMX_MOE<T>* moe;
+      int qlen;
+      int k;
+      const uint64_t* expert_ids;
+      const float* weights;
+      const void* input;
+      void* output;
+      int* batch_size_tensor;
+    };
+    static void inner(void* args) {
+      Args* args_ = (Args*)args;
+      args_->cpuinfer->enqueue(&AMX_MOE<T>::forward, args_->moe, args_->qlen, args_->k, args_->expert_ids,
+                               args_->weights, args_->input, args_->output, args_->batch_size_tensor);
+    }
+    static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T>& moe, int qlen, int k, intptr_t expert_ids,
+                                                            intptr_t weights, intptr_t input, intptr_t output,
+                                                            intptr_t batch_size_tensor) {
+      Args* args = new Args{nullptr,
+                            &moe,
+                            qlen,
+                            k,
+                            (const uint64_t*)expert_ids,
+                            (const float*)weights,
+                            (const void*)input,
+                            (void*)output,
+                            (int*)batch_size_tensor};
+      return std::make_pair((intptr_t)&inner, (intptr_t)args);
+    }
+  };
+};

 PYBIND11_MODULE(cpuinfer_ext, m) {
   py::class_<CPUInfer>(m, "CPUInfer")
       .def(py::init<int>())

@@ -621,6 +691,27 @@ PYBIND11_MODULE(cpuinfer_ext, m) {
       .def("warm_up", &MOEBindings::WarmUpBindinds::cpuinfer_interface)
       .def("forward", &MOEBindings::ForwardBindings::cpuinfer_interface);
+  py::class_<AMX_MOEConfig>(moe_module, "AMX_MOEConfig")
+      .def(py::init([](int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int max_len,
+                       intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj) {
+        return AMX_MOEConfig(expert_num, routed_expert_num, hidden_size, intermediate_size, max_len,
+                             (void*)gate_proj, (void*)up_proj, (void*)down_proj);
+      }));
+  py::class_<AMX_MOE<amx::GemmKernel224BF>>(moe_module, "AMXBF16_MOE")
+      .def(py::init<AMX_MOEConfig>())
+      .def("warm_up", &AMX_MOEBindings<amx::GemmKernel224BF>::WarmUpBindings::cpuinfer_interface)
+      .def("load_weights", &AMX_MOEBindings<amx::GemmKernel224BF>::LoadWeightsBindings::cpuinfer_interface)
+      .def("forward", &AMX_MOEBindings<amx::GemmKernel224BF>::ForwardBindings::cpuinfer_interface);
+  py::class_<AMX_MOE<amx::GemmKernel224Int8>>(moe_module, "AMXInt8_MOE")
+      .def(py::init<AMX_MOEConfig>())
+      .def("warm_up", &AMX_MOEBindings<amx::GemmKernel224Int8>::WarmUpBindings::cpuinfer_interface)
+      .def("load_weights", &AMX_MOEBindings<amx::GemmKernel224Int8>::LoadWeightsBindings::cpuinfer_interface)
+      .def("forward", &AMX_MOEBindings<amx::GemmKernel224Int8>::ForwardBindings::cpuinfer_interface);
   auto kvcache_module = m.def_submodule("kvcache");
   py::enum_<AnchorType>(kvcache_module, "AnchorType")
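
A note on the binding pattern above: the cpuinfer_interface methods do not run any work themselves. Each call packs its arguments into a heap-allocated Args struct and returns the pair ((intptr_t)&inner, (intptr_t)args); CPUInfer.submit later invokes inner, which enqueues the real AMX_MOE method on the worker pool (the cpuinfer field is left as nullptr here and is presumably filled in by CPUInfer when the task is submitted). From Python this is the same submit/sync protocol the benchmark above already uses. A minimal sketch of that protocol, assuming cpuinfer_ext has been built and reusing the benchmark's tensor shapes:

import cpuinfer_ext
import torch

# Dummy expert weights (bf16, CPU, contiguous) with the benchmark's shapes:
# gate/up: [expert_num, intermediate_size, hidden_size], down: [expert_num, hidden_size, intermediate_size]
expert_num, n_routed_experts, hidden_size, intermediate_size, max_len = 8, 8, 7168, 2048, 25600
gate = torch.randn(expert_num, intermediate_size, hidden_size, dtype=torch.bfloat16).contiguous()
up = torch.randn(expert_num, intermediate_size, hidden_size, dtype=torch.bfloat16).contiguous()
down = torch.randn(expert_num, hidden_size, intermediate_size, dtype=torch.bfloat16).contiguous()

cfg = cpuinfer_ext.moe.AMX_MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, max_len,
                                     gate.data_ptr(), up.data_ptr(), down.data_ptr())
cpu_infer = cpuinfer_ext.CPUInfer(65)           # CPU worker pool, thread count as in the benchmark
moe = cpuinfer_ext.moe.AMXBF16_MOE(cfg)

task = moe.load_weights()   # returns (function pointer, Args*) as two ints; nothing has run yet
cpu_infer.submit(task)      # hand the task to the CPU worker pool
cpu_infer.sync()            # block until the expert weights have been packed into BufferB form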

csrc/ktransformers_ext/operators/amx/la/amx.hpp (new file, mode 100644)

(Diff collapsed in the web view: 974 added lines, containing the amx:: GEMM kernels (GemmKernel224BF, GemmKernel224Int8), their BufferA/B/C packing types, and amx::mat_mul referenced by moe.hpp below; not reproduced here.)

csrc/ktransformers_ext/operators/amx/la/utils.hpp (new file, mode 100644)

/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2025-04-25 18:28:12
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2025-04-25 18:28:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#pragma once

#include <cstdint>

template <typename T>
T* offset_pointer(T* ptr, std::size_t byte_offset) {
  return reinterpret_cast<T*>(reinterpret_cast<char*>(ptr) + byte_offset);
}

template <typename T>
const T* offset_pointer(const T* ptr, std::size_t byte_offset) {
  return reinterpret_cast<const T*>(reinterpret_cast<const char*>(ptr) + byte_offset);
}

template <typename T>
T* offset_pointer_row_major(T* t, int row, int col, std::size_t ld) {
  return offset_pointer(t, row * ld) + col;
}

template <typename T>
T* offset_pointer_col_major(T* t, int row, int col, std::size_t ld) {
  return offset_pointer(t, col * ld) + row;
}

static inline void avx512_copy_32xbf16(__m512i* src, __m512i* dst) {
  _mm512_storeu_si512(dst, _mm512_loadu_si512(src));
}

static inline void avx512_32xfp32_to_32xbf16(__m512* src0, __m512* src1, __m512i* dst) {
  _mm512_storeu_si512(dst, __m512i(_mm512_cvtne2ps_pbh(*src1, *src0)));
}

static inline void avx512_32xbf16_to_32xfp32(__m512i* src, __m512* dst0, __m512* dst1) {
  _mm512_storeu_ps(dst0, _mm512_castsi512_ps(_mm512_slli_epi32(
                             _mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)(src))), 16)));
  _mm512_storeu_ps(dst1, _mm512_castsi512_ps(_mm512_slli_epi32(
                             _mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)(src) + 1)), 16)));
}
\ No newline at end of file
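
The bf16 helpers above rest on the fact that bfloat16 is the top 16 bits of an IEEE-754 float32: widening each bf16 lane to 32 bits and shifting left by 16 (the _mm512_slli_epi32(..., 16) in avx512_32xbf16_to_32xfp32) reproduces the float exactly, while _mm512_cvtne2ps_pbh rounds fp32 back down to bf16. A scalar NumPy illustration of the widening direction (illustration only, not part of the commit):

import numpy as np

def bf16_bits_to_fp32(bf16_bits: np.ndarray) -> np.ndarray:
    # Place the 16 bf16 bits into the top half of a 32-bit word, as the
    # _mm512_slli_epi32(..., 16) in avx512_32xbf16_to_32xfp32 does.
    return (bf16_bits.astype(np.uint32) << 16).view(np.float32)

x = np.float32(3.14159)
bf16 = np.uint16(np.frombuffer(x.tobytes(), dtype=np.uint32)[0] >> 16)  # truncate fp32 to its bf16 bits
print(bf16_bits_to_fp32(np.array([bf16])))  # ~3.140625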

csrc/ktransformers_ext/operators/amx/moe.hpp (new file, mode 100644)

/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2025-04-25 18:28:12
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2025-04-25 18:28:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_AMX_MOE_H
#define CPUINFER_OPERATOR_AMX_MOE_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
#include "la/amx.hpp"

#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>

void* numa_alloc_aligned(size_t size, int node, size_t alignment) {
  void* ptr = numa_alloc_onnode(size, node);
  assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
  return ptr;
}
#endif

static inline __m512 exp_avx512(__m512 x) {
  const __m512 log2e = _mm512_set1_ps(1.44269504089f);
  const __m512 c1 = _mm512_set1_ps(0.69314718056f);
  __m512 y = _mm512_mul_ps(x, log2e);
  __m512i int_part = _mm512_cvtps_epi32(y);
  __m512 frac_part = _mm512_sub_ps(y, _mm512_cvtepi32_ps(int_part));
  const __m512 poly_1 = _mm512_set1_ps(0.9999999995f);
  const __m512 poly_2 = _mm512_set1_ps(0.6931471805f);
  const __m512 poly_3 = _mm512_set1_ps(0.2402265069f);
  const __m512 poly_4 = _mm512_set1_ps(0.0555041087f);
  const __m512 poly_5 = _mm512_set1_ps(0.0096181291f);
  const __m512 poly_6 = _mm512_set1_ps(0.0013333558f);
  __m512 frac_exp = _mm512_fmadd_ps(
      frac_part, poly_6,
      _mm512_fmadd_ps(frac_part, poly_5,
                      _mm512_fmadd_ps(frac_part, poly_4,
                                      _mm512_fmadd_ps(frac_part, poly_3,
                                                      _mm512_fmadd_ps(frac_part, poly_2, poly_1)))));
  __m512 two_pow_i = _mm512_scalef_ps(_mm512_set1_ps(1.0f), _mm512_cvtepi32_ps(int_part));
  return _mm512_mul_ps(two_pow_i, frac_exp);
}

static inline __m512 act_fn(__m512 gate_val, __m512 up_val) {
  __m512 neg_gate_val = _mm512_sub_ps(_mm512_setzero_ps(), gate_val);
  __m512 exp_neg_gate = exp_avx512(neg_gate_val);
  __m512 denom = _mm512_add_ps(_mm512_set1_ps(1.0f), exp_neg_gate);
  __m512 act_val = _mm512_div_ps(gate_val, denom);
  return _mm512_mul_ps(act_val, up_val);
}

struct AMX_MOEConfig {
  int expert_num;
  int routed_expert_num;
  int hidden_size;
  int intermediate_size;
  int max_len;
  void* gate_proj;
  void* up_proj;
  void* down_proj;

  AMX_MOEConfig() {}

  AMX_MOEConfig(int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int max_len,
                void* gate_proj, void* up_proj, void* down_proj)
      : expert_num(expert_num),
        routed_expert_num(routed_expert_num),
        hidden_size(hidden_size),
        intermediate_size(intermediate_size),
        max_len(max_len),
        gate_proj(gate_proj),
        up_proj(up_proj),
        down_proj(down_proj) {}
};

template <class T>
class AMX_MOE {
 private:
  AMX_MOEConfig config_;
  void* gate_proj_;  // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
  void* up_proj_;    // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
  void* down_proj_;  // [expert_num * hidden_size * intermediate_size ( /32 if quantized)]

  ggml_bf16_t* m_local_input_;        // [routed_expert_num * max_len * hidden_size]
  ggml_bf16_t* m_local_gate_output_;  // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t* m_local_up_output_;    // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t* m_local_down_output_;  // [routed_expert_num * max_len * hidden_size]

  std::vector<std::vector<int>> m_local_pos_;          // [max_len, routed_expert_num]
  std::vector<int> m_local_num_;                       // [expert_num]
  std::vector<int> m_expert_id_map_;                   // [expert_num]
  std::vector<ggml_bf16_t*> m_local_input_ptr_;        // [expert_num]
  std::vector<ggml_bf16_t*> m_local_gate_output_ptr_;  // [expert_num]
  std::vector<ggml_bf16_t*> m_local_up_output_ptr_;    // [expert_num]
  std::vector<ggml_bf16_t*> m_local_down_output_ptr_;  // [expert_num]

  std::vector<std::shared_ptr<typename T::BufferA>> gate_up_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> gate_bc_;
  std::vector<std::shared_ptr<typename T::BufferC>> up_bc_;
  std::vector<std::shared_ptr<typename T::BufferA>> down_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> down_bc_;

#ifdef USE_NUMA
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> gate_bb_numa_;
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> up_bb_numa_;
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> down_bb_numa_;
#else
  std::vector<std::shared_ptr<typename T::BufferB>> gate_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> up_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> down_bb_;
#endif

 public:
  AMX_MOE(AMX_MOEConfig config) {
    config_ = config;
    gate_proj_ = config_.gate_proj;
    up_proj_ = config_.up_proj;
    down_proj_ = config_.down_proj;

    std::vector<std::pair<void**, uint64_t>> m_mem_requests;
    m_mem_requests.push_back({(void**)&m_local_input_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    m_mem_requests.push_back({(void**)&m_local_gate_output_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_up_output_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_down_output_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    std::vector<void*> gate_up_ba_ptr(config_.expert_num);
    std::vector<void*> gate_bc_ptr(config_.expert_num);
    std::vector<void*> up_bc_ptr(config_.expert_num);
    std::vector<void*> down_ba_ptr(config_.expert_num);
    std::vector<void*> down_bc_ptr(config_.expert_num);
    for (int i = 0; i < config_.expert_num; i++) {
      m_mem_requests.push_back({(void**)&gate_up_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.hidden_size)});
      m_mem_requests.push_back({(void**)&gate_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back({(void**)&up_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back({(void**)&down_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back({(void**)&down_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.hidden_size)});
    }
    shared_mem_buffer.alloc(this, m_mem_requests);

    m_local_pos_.resize(config_.max_len);
    for (int i = 0; i < config_.max_len; i++) {
      m_local_pos_[i].resize(config_.routed_expert_num);
    }
    m_expert_id_map_.resize(config_.expert_num);
    m_local_num_.resize(config_.expert_num);
    m_local_input_ptr_.resize(config_.expert_num);
    m_local_gate_output_ptr_.resize(config_.expert_num);
    m_local_up_output_ptr_.resize(config_.expert_num);
    m_local_down_output_ptr_.resize(config_.expert_num);

    for (uint64_t i = 0; i < config_.expert_num; i++) {
      gate_up_ba_.push_back(std::make_shared<typename T::BufferA>(config_.max_len, config_.hidden_size, gate_up_ba_ptr[i]));
      gate_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, gate_bc_ptr[i]));
      up_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, up_bc_ptr[i]));
      down_ba_.push_back(std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, down_ba_ptr[i]));
      down_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, down_bc_ptr[i]));
#ifdef USE_NUMA
      int numa_nodes = numa_num_configured_nodes();
      gate_bb_numa_.resize(numa_nodes);
      up_bb_numa_.resize(numa_nodes);
      down_bb_numa_.resize(numa_nodes);
      for (int j = 0; j < numa_nodes; j++) {
        void* gate_bb_ptr = numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
        gate_bb_numa_[j].push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));
        void* up_bb_ptr = numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
        up_bb_numa_[j].push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));
        void* down_bb_ptr = numa_alloc_aligned(T::BufferB::required_size(config_.hidden_size, config_.intermediate_size), j, 64);
        down_bb_numa_[j].push_back(std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
      }
#else
      void* gate_bb_ptr = std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
      gate_bb_.push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));
      void* up_bb_ptr = std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
      up_bb_.push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));
      void* down_bb_ptr = std::aligned_alloc(64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size));
      down_bb_.push_back(std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
#endif
    }
  }

  ~AMX_MOE() { shared_mem_buffer.dealloc(this); }

  void load_weights(Backend* backend) {
    int nth = T::recommended_nth(config_.intermediate_size);
    backend->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [&](int task_id) {
          uint64_t expert_idx = task_id / nth;
          int ith = task_id % nth;
#ifdef USE_NUMA
          int numa_nodes = numa_num_configured_nodes();
          for (int j = 0; j < numa_nodes; j++) {
            gate_bb_numa_[j][expert_idx]->from_mat(
                (ggml_bf16_t*)config_.gate_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
            up_bb_numa_[j][expert_idx]->from_mat(
                (ggml_bf16_t*)config_.up_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
          }
#else
          gate_bb_[expert_idx]->from_mat(
              (ggml_bf16_t*)config_.gate_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
          up_bb_[expert_idx]->from_mat(
              (ggml_bf16_t*)config_.up_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
#endif
        },
        nullptr);
    nth = T::recommended_nth(config_.hidden_size);
    backend->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [&](int task_id) {
          uint64_t expert_idx = task_id / nth;
          int ith = task_id % nth;
#ifdef USE_NUMA
          int numa_nodes = numa_num_configured_nodes();
          for (int j = 0; j < numa_nodes; j++) {
            down_bb_numa_[j][expert_idx]->from_mat(
                (ggml_bf16_t*)config_.down_proj + expert_idx * config_.hidden_size * config_.intermediate_size, ith, nth);
          }
#else
          down_bb_[expert_idx]->from_mat(
              (ggml_bf16_t*)config_.down_proj + expert_idx * config_.hidden_size * config_.intermediate_size, ith, nth);
#endif
        },
        nullptr);
  }

  void warm_up(Backend* backend) {}

  void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output,
               int* batch_size_tensor, Backend* backend) {
    bool use_amx = (qlen > 4 * config_.expert_num / config_.routed_expert_num);
    qlen = batch_size_tensor[0];
    int activated_expert = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_num_[i] = 0;
    }
    for (int i = 0; i < qlen; i++) {
      for (int j = 0; j < k; j++) {
        m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
      }
    }
    for (int i = 0; i < config_.expert_num; i++) {
      if (m_local_num_[i] > 0) {
        m_expert_id_map_[activated_expert] = i;
        activated_expert++;
      }
    }
    uint64_t offset = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_input_ptr_[i] = m_local_input_ + offset * config_.hidden_size;
      m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
      m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
      m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
      offset += m_local_num_[i];
    }
    backend->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int j = 0; j < k; j++) {
            memcpy(m_local_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size,
                   (ggml_bf16_t*)input + i * config_.hidden_size, sizeof(ggml_bf16_t) * config_.hidden_size);
          }
        },
        nullptr);
    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          gate_up_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_input_ptr_[expert_idx], 0, 1);
        },
        nullptr);
    int nth = T::recommended_nth(config_.intermediate_size);
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], gate_bb_numa_[Backend::numa_node][expert_idx], gate_bc_[expert_idx],
                       ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], up_bb_numa_[Backend::numa_node][expert_idx], up_bc_[expert_idx],
                       ith, nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], gate_bb_[expert_idx], gate_bc_[expert_idx], ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], up_bb_[expert_idx], up_bc_[expert_idx], ith, nth, use_amx);
#endif
          gate_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], ith, nth);
          up_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_up_output_ptr_[expert_idx], ith, nth);
          auto [n_start, n_end] = T::split_range_n(config_.intermediate_size, ith, nth);
          for (int i = 0; i < m_local_num_[expert_idx]; i++) {
            ggml_bf16_t* gate_output_ptr = &m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size];
            ggml_bf16_t* up_output_ptr = &m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size];
            for (int j = n_start; j < n_end; j += 32) {
              __m512 gate_val0, gate_val1, up_val0, up_val1;
              avx512_32xbf16_to_32xfp32((__m512i*)(gate_output_ptr + j), &gate_val0, &gate_val1);
              avx512_32xbf16_to_32xfp32((__m512i*)(up_output_ptr + j), &up_val0, &up_val1);
              __m512 result0 = act_fn(gate_val0, up_val0);
              __m512 result1 = act_fn(gate_val1, up_val1);
              avx512_32xfp32_to_32xbf16(&result0, &result1, (__m512i*)(gate_output_ptr + j));
            }
          }
        },
        nullptr);
    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          down_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], 0, 1);
        },
        nullptr);
    nth = T::recommended_nth(config_.hidden_size);
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx],
                       down_bb_numa_[Backend::numa_node][expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx],
                       down_bb_[expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#endif
          down_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_down_output_ptr_[expert_idx], ith, nth);
        },
        nullptr);
    backend->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int e = 0; e < config_.hidden_size; e += 32) {
            __m512 x0 = _mm512_setzero_ps();
            __m512 x1 = _mm512_setzero_ps();
            for (int j = 0; j < k; j++) {
              __m512 weight = _mm512_set1_ps(weights[i * k + j]);
              __m512 down_output0, down_output1;
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(m_local_down_output_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size + e),
                  &down_output0, &down_output1);
              x0 = _mm512_fmadd_ps(down_output0, weight, x0);
              x1 = _mm512_fmadd_ps(down_output1, weight, x1);
            }
            avx512_32xfp32_to_32xbf16(&x0, &x1, (__m512i*)((ggml_bf16_t*)output + i * config_.hidden_size + e));
          }
        },
        nullptr);
  }
};

#endif
\ No newline at end of file
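
For reference, act_fn above computes silu(gate) * up: the gate value divided by 1 + exp(-gate), then multiplied elementwise by the up projection, with exp_avx512 supplying a polynomial approximation of exp. A plain NumPy sketch of the same math on float32 arrays, without the bf16 packing (illustration only):

import numpy as np

def act_fn(gate: np.ndarray, up: np.ndarray) -> np.ndarray:
    # Matches the AVX-512 act_fn: gate / (1 + exp(-gate)) * up, i.e. SiLU(gate) * up.
    return gate / (1.0 + np.exp(-gate)) * up

gate = np.random.randn(32).astype(np.float32)
up = np.random.randn(32).astype(np.float32)
print(act_fn(gate, up))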

csrc/ktransformers_ext/operators/llamafile/linear.h

@@ -17,12 +17,12 @@
 #include <vector>

 #include "../../cpu_backend/backend.h"
+#include "../../cpu_backend/shared_mem_buffer.h"
 #include "conversion.h"
 #include "llama.cpp/ggml-impl.h"
 #include "llama.cpp/ggml-quants.h"
 #include "llama.cpp/ggml.h"
 #include "llamafile/sgemm.h"
-#include "shared_mem_buffer.h"

 struct LinearConfig {
   int input_size;

csrc/ktransformers_ext/operators/llamafile/mlp.h

@@ -17,12 +17,12 @@
 #include <vector>

 #include "../../cpu_backend/backend.h"
+#include "../../cpu_backend/shared_mem_buffer.h"
 #include "conversion.h"
 #include "llama.cpp/ggml-impl.h"
 #include "llama.cpp/ggml-quants.h"
 #include "llama.cpp/ggml.h"
 #include "llamafile/sgemm.h"
-#include "shared_mem_buffer.h"

 struct MLPConfig {
   int hidden_size;

csrc/ktransformers_ext/operators/llamafile/moe.h

@@ -17,12 +17,12 @@
 #include <vector>

 #include "../../cpu_backend/backend.h"
+#include "../../cpu_backend/shared_mem_buffer.h"
 #include "conversion.h"
 #include "llama.cpp/ggml-impl.h"
 #include "llama.cpp/ggml-quants.h"
 #include "llama.cpp/ggml.h"
 #include "llamafile/sgemm.h"
-#include "shared_mem_buffer.h"

 struct MOEConfig {
   int expert_num;

csrc/ktransformers_ext/operators/llamafile/shared_mem_buffer.h (deleted, mode 100644 → 0)

(All 37 lines removed. The deleted header is identical to the SharedMemBuffer declaration now living at csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.h, reproduced above.)

ktransformers/operators/experts.py

@@ -25,8 +25,9 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext
 sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
 import cpuinfer_ext
 from cpuinfer_ext.moe import MOEConfig, MOE
+from cpuinfer_ext.moe import AMX_MOEConfig, AMXBF16_MOE, AMXInt8_MOE
 import ctypes
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_gguf import GGMLQuantizationType, GGUFLoader
 from ktransformers.util.utils import InferenceState
 from ktransformers.server.config.config import Config
 from transformers.activations import ACT2FN

@@ -141,6 +142,7 @@ class KExpertsCPU(KExpertsBase):
         assert device.lower() == "cpu", "KExpertsCPU can only be loaded on CPU"
         self.n_routed_experts = n_routed_experts
         self.out_device = out_device
+        self.backend = kwargs.get("backend", "llamafile")

     def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None, warmup: bool = False):
         if device:

@@ -163,7 +165,9 @@ class KExpertsCPU(KExpertsBase):
         )
         # print(self.gate_qtype, self.up_qtype, self.down_qtype)
         n_routed_experts = self.n_routed_experts
+        self.cpu_infer = KExpertsCPU.CPU_INFER
         # n_routed_experts = len(self.orig_module)
+        if self.backend == "llamafile":
             moe_config = MOEConfig(
                 n_routed_experts,
                 self.config.num_experts_per_tok,

@@ -180,10 +184,43 @@ class KExpertsCPU(KExpertsBase):
                 self.down_type,
                 30, # TODO: get from model.dtype
             )
+            self.moe = MOE(moe_config)
+        elif self.backend == "AMXBF16":
+            assert self.gate_type == GGMLQuantizationType.BF16
+            assert self.up_type == GGMLQuantizationType.BF16
+            assert self.down_type == GGMLQuantizationType.BF16
+            moe_config = AMX_MOEConfig(
+                n_routed_experts,
+                self.config.num_experts_per_tok,
+                self.config.hidden_size,
+                self.config.moe_intermediate_size,
+                25600,
+                gate_ptr,
+                up_ptr,
+                down_ptr,
+            )
+            self.moe = AMXBF16_MOE(moe_config)
+            self.cpu_infer.submit(self.moe.load_weights())
+            self.cpu_infer.sync()
+        elif self.backend == "AMXInt8":
+            assert self.gate_type == GGMLQuantizationType.BF16
+            assert self.up_type == GGMLQuantizationType.BF16
+            assert self.down_type == GGMLQuantizationType.BF16
+            moe_config = AMX_MOEConfig(
+                n_routed_experts,
+                self.config.num_experts_per_tok,
+                self.config.hidden_size,
+                self.config.moe_intermediate_size,
+                25600,
+                gate_ptr,
+                up_ptr,
+                down_ptr,
+            )
+            self.moe = AMXInt8_MOE(moe_config)
+            self.cpu_infer.submit(self.moe.load_weights())
+            self.cpu_infer.sync()
         # print(n_routed_experts, hidden_size, moe_intermediate_size)
         num_experts_per_tok = self.config.num_experts_per_tok
-        self.moe = MOE(moe_config)
-        self.cpu_infer = KExpertsCPU.CPU_INFER
         if warmup:
             self.cpu_infer.submit(self.moe.warm_up())
             self.cpu_infer.sync()

ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-amx.yaml (new file, mode 100644)

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False  # change this to True to enable long context (prefill may be slower)
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
\ No newline at end of file
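
To run DeepSeek-V3 with the AMX experts path, this rule file is selected in place of the stock DeepSeek-V3-Chat.yaml when the model is optimized. Functionally the experts rule differs only by the added backend field, which KExpertsCPU reads via kwargs.get("backend", "llamafile") and which accepts "AMXInt8", "AMXBF16", or the default "llamafile". Note that both AMX backends assert that the gate, up, and down expert tensors in the GGUF are stored as BF16 (see the experts.py hunk above).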