OpenDAS / ktransformers / Commits / 11544ef2

Unverified commit 11544ef2, authored Aug 08, 2024 by UnicornChan, committed by GitHub on Aug 08, 2024.

Merge pull request #25 from kvcache-ai/windows

Windows Support

Parents: 442e13bc 0e613b60

Changes (33)

Showing 20 changed files with 258 additions and 96 deletions (+258 -96):
ktransformers/ktransformers_ext/CMakeLists.txt                    +19  -4
ktransformers/ktransformers_ext/cpu_backend/task_queue.h          +41  -3
ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu   +54  -19
ktransformers/ktransformers_ext/cuda/setup.py                     +1   -1
ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp       +5   -5
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp       +18  -18
ktransformers/ktransformers_ext/operators/llamafile/moe.h         +1   -1
ktransformers/local_chat.py                                       +1   -1
ktransformers/models/custom_cache.py                              +1   -0
ktransformers/operators/experts.py                                +10  -3
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml       +1   -1
ktransformers/util/custom_gguf.py                                 +62  -17
ktransformers/util/utils.py                                       +8   -7
pyproject.toml                                                    +2   -1
setup.py                                                          +15  -2
third_party/llamafile/iqk_mul_mat.inc                             +4   -3
third_party/llamafile/iqk_mul_mat_amd_avx2.cpp                    +1   -1
third_party/llamafile/iqk_mul_mat_amd_zen4.cpp                    +1   -1
third_party/llamafile/sgemm.cpp                                   +12  -7
third_party/llamafile/tinyblas_cpu.h                              +1   -1
ktransformers/ktransformers_ext/CMakeLists.txt  (View file @ 11544ef2)

-cmake_minimum_required(VERSION 3.16)
+cmake_minimum_required(VERSION 3.17)
 project(cpuinfer_ext VERSION 0.1.0)
 set(CMAKE_CXX_STANDARD 17)
...
@@ -190,7 +190,13 @@ else()
     message(STATUS "Unknown architecture")
 endif()

-find_package(CUDA REQUIRED)
+# message(STATUS "CUDAToolkit_ROOT:${CUDAToolkit_ROOT}")
+# find_package(FindCUDAToolkit REQUIRED)
+# if(CUDAToolkit_FOUND)
+#     message(STATUS "Found CUDA cudart lib at:${CUDAToolkit_LIBRARY_DIR}")
+# else()
+#     message(STATUS "Can't found CUDA lib")
+# endif()
 add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
...
@@ -199,7 +205,12 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/pybind11 ${CMAKE_
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llama.cpp ${CMAKE_CURRENT_BINARY_DIR}/third_party/llama.cpp)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party)
-include_directories("${CUDA_INCLUDE_DIRS}")
+if (WIN32)
+    include_directories("$ENV{CUDA_PATH}/include")
+elseif (UNIX)
+    find_package(CUDA REQUIRED)
+    include_directories("${CUDA_INCLUDE_DIRS}")
+endif()
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} SOURCE_DIR1)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
...
@@ -210,4 +221,8 @@ message(STATUS "ALL_SOURCES: ${ALL_SOURCES}")
 pybind11_add_module(${PROJECT_NAME} MODULE ${ALL_SOURCES})
 target_link_libraries(${PROJECT_NAME} PRIVATE llama)
-target_link_libraries(${PROJECT_NAME} PRIVATE "/usr/local/cuda/lib64/libcudart.so")
\ No newline at end of file
+if (WIN32)
+    target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_PATH}/lib/x64/cudart.lib") #CUDA::cudart
+elseif (UNIX)
+    target_link_libraries(${PROJECT_NAME} PRIVATE "$ENV{CUDA_HOME}/lib64/libcudart.so")
+endif()
\ No newline at end of file
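The hunks above drop the hard-coded /usr/local/cuda/lib64/libcudart.so and instead resolve CUDA headers and the runtime library from $ENV{CUDA_PATH} on Windows or $ENV{CUDA_HOME} on Linux. A quick way to confirm that the resulting include and library paths actually reach cudart on either platform is a one-file check like the sketch below; the file name, the build commands in the comment, and the check itself are illustrative and not part of this commit.

// check_cudart.cpp -- illustrative only, not part of this commit.
// Build against the same paths the CMakeLists.txt above uses, e.g.
//   cl /I"%CUDA_PATH%\include" check_cudart.cpp "%CUDA_PATH%\lib\x64\cudart.lib"   (Windows)
//   g++ -I"$CUDA_HOME/include" check_cudart.cpp "$CUDA_HOME/lib64/libcudart.so"    (Linux)
#include <cstdio>
#include <cuda_runtime_api.h>

int main() {
    int runtime_version = 0;
    // Resolves a symbol from cudart, so it fails to link if the library path is wrong.
    cudaError_t err = cudaRuntimeGetVersion(&runtime_version);
    if (err != cudaSuccess) {
        std::printf("cudart call failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    std::printf("linked against CUDA runtime %d\n", runtime_version);
    return 0;
}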
ktransformers/ktransformers_ext/cpu_backend/task_queue.h  (View file @ 11544ef2)

@@ -3,8 +3,8 @@
  * @Author : chenht2022
  * @Date : 2024-07-16 10:43:18
  * @Version : 1.0.0
- * @LastEditors : chenht2022
- * @LastEditTime : 2024-07-25 10:33:47
+ * @LastEditors : chenxl
+ * @LastEditTime : 2024-08-08 04:23:51
  * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
 #ifndef CPUINFER_TASKQUEUE_H
...
@@ -17,6 +17,44 @@
 #include <queue>
 #include <thread>
 #include <vector>
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+class custom_mutex {
+   private:
+#ifdef _WIN32
+    HANDLE global_mutex;
+#else
+    std::mutex global_mutex;
+#endif
+
+   public:
+    custom_mutex() {
+#ifdef _WIN32
+        HANDLE global_mutex;
+#endif
+    }
+
+    void lock() {
+#ifdef _WIN32
+        WaitForSingleObject(global_mutex, INFINITE);
+#else
+        global_mutex.lock();
+#endif
+    }
+
+    void unlock() {
+#ifdef _WIN32
+        ReleaseMutex(global_mutex);
+#else
+        global_mutex.lock();
+#endif
+    }
+};
+
 class TaskQueue {
    public:
...
@@ -32,7 +70,7 @@ class TaskQueue {
     std::queue<std::function<void()>> tasks;
     std::thread worker;
-    std::mutex mutex;
+    custom_mutex mutex;
     std::atomic<bool> sync_flag;
     std::atomic<bool> exit_flag;
 };
...
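The new custom_mutex wraps a Win32 mutex handle under _WIN32 and std::mutex elsewhere (std::mutex also exists on MSVC, so the split presumably works around a project-specific build issue rather than a missing feature). Note that, as committed, the constructor only declares a local HANDLE instead of creating one, and the non-Windows branch of unlock() calls global_mutex.lock(). For reference, a conventional version of the same wrapper would look roughly like the sketch below; this is illustrative only and is not what the commit contains.

// Sketch of a portable mutex wrapper in the spirit of custom_mutex above (illustrative only).
#ifdef _WIN32
#include <windows.h>
#else
#include <mutex>
#endif

class portable_mutex {
   private:
#ifdef _WIN32
    HANDLE handle_;
#else
    std::mutex mutex_;
#endif

   public:
    portable_mutex() {
#ifdef _WIN32
        handle_ = CreateMutex(NULL, FALSE, NULL);  // unowned, unnamed mutex
#endif
    }
    ~portable_mutex() {
#ifdef _WIN32
        CloseHandle(handle_);
#endif
    }
    void lock() {
#ifdef _WIN32
        WaitForSingleObject(handle_, INFINITE);
#else
        mutex_.lock();
#endif
    }
    void unlock() {
#ifdef _WIN32
        ReleaseMutex(handle_);
#else
        mutex_.unlock();
#endif
    }
};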
ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu  (View file @ 11544ef2)

@@ -1703,28 +1703,63 @@ void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
     thread_m_blocks = exec_cfg.max_m_blocks;
   }

   // Define kernel configurations
-  if (false) {
-  }
-  CALL_IF(4, 32, 2, 256)
-  CALL_IF(4, 16, 4, 256)
-  CALL_IF(4, 8, 8, 256)
-  CALL_IF(4, 8, 4, 128)
-  CALL_IF(4, 4, 8, 128)
-  CALL_IF(8, 32, 2, 256)
-  CALL_IF(8, 16, 4, 256)
-  CALL_IF(8, 8, 8, 256)
-  CALL_IF(8, 8, 4, 128)
-  CALL_IF(8, 4, 8, 128)
-  else {
-    TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " +
-                           str(prob_n) + ", " + str(prob_k) + "]" +
-                           ", has_act_order = " + str(has_act_order) +
-                           ", num_groups = " + str(num_groups) +
-                           ", group_size = " + str(group_size) +
-                           ", thread_m_blocks = " + str(thread_m_blocks) +
-                           ", thread_n_blocks = " + str(thread_n_blocks) +
-                           ", thread_k_blocks = " + str(thread_k_blocks));
-  }
+  #define undefined_error TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + \
+            str(prob_n) + ", " + str(prob_k) + "]" + \
+            ", has_act_order = " + str(has_act_order) + \
+            ", num_groups = " + str(num_groups) + \
+            ", group_size = " + str(group_size) + \
+            ", thread_m_blocks = " + str(thread_m_blocks) + \
+            ", thread_n_blocks = " + str(thread_n_blocks) + \
+            ", thread_k_blocks = " + str(thread_k_blocks));
+
+  if (num_bits == 4 && num_threads == 256) {
+    if (false) {
+    }
+    CALL_IF(4, 32, 2, 256)
+    CALL_IF(4, 16, 4, 256)
+    CALL_IF(4, 8, 8, 256)
+    else {
+      undefined_error
+    }
+  }
+  else if (num_bits == 4 && num_threads == 128) {
+    if (false) {
+    }
+    CALL_IF(4, 8, 4, 128)
+    CALL_IF(4, 4, 8, 128)
+    else {
+      undefined_error
+    }
+  }
+  else if (num_bits == 8 && num_threads == 256) {
+    if (false) {
+    }
+    CALL_IF(8, 32, 2, 256)
+    CALL_IF(8, 16, 4, 256)
+    CALL_IF(8, 8, 8, 256)
+    else {
+      undefined_error
+    }
+  }
+  else if (num_bits == 8 && num_threads == 128) {
+    if (false) {
+    }
+    CALL_IF(8, 8, 4, 128)
+    CALL_IF(8, 4, 8, 128)
+    else {
+      undefined_error
+    }
+  }
+  else {
+    undefined_error
+  }

   A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par;
...
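The restructuring above splits one long CALL_IF dispatch into groups keyed on num_bits and num_threads and routes every unmatched case through the new undefined_error macro; keeping each macro-expanded else-if chain short is plausibly what the Windows (MSVC) build needed, though the commit does not state the reason. The pattern evidently relies on each CALL_IF(...) expanding to an else if (...) { ... } clause, which is why every group is seeded with an empty if (false) {}. A stripped-down illustration of that macro-chaining idiom follows; DISPATCH_IF, run_kernel, and the parameter names are hypothetical, not the actual definitions from this file.

// Hypothetical illustration of the CALL_IF-style dispatch idiom used above.
#include <cstdio>

template <int M, int N, int K, int T>
void run_kernel() { std::printf("kernel<%d,%d,%d,%d>\n", M, N, K, T); }

// Each use appends an `else if` clause to the chain started by an empty `if (false) {}`,
// so shapes that match no clause fall through to the final else.
#define DISPATCH_IF(M, N, K, T)                                        \
    else if (m_blocks == (M) && n_blocks == (N) && k_blocks == (K) &&  \
             threads == (T)) {                                         \
        run_kernel<M, N, K, T>();                                      \
    }

void dispatch(int m_blocks, int n_blocks, int k_blocks, int threads) {
    if (false) {
    }
    DISPATCH_IF(32, 2, 8, 256)
    DISPATCH_IF(16, 4, 8, 256)
    else {
        std::printf("unsupported configuration\n");  // plays the role of undefined_error
    }
}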
ktransformers/ktransformers_ext/cuda/setup.py  (View file @ 11544ef2)

@@ -10,7 +10,7 @@ setup(name='KTransformersOps',
              'custom_gguf/dequant.cu',
              'binding.cpp',
              'gptq_marlin/gptq_marlin.cu',
              # 'gptq_marlin_repack.cu',
          ])
     ],
     cmdclass={'build_ext': BuildExtension
...
ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp  (View file @ 11544ef2)

@@ -68,10 +68,10 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
     int nth = config_.intermediate_size / config_.stride;
     backend->do_work_stealing_job(nth, [&](int task_id) {
         int ith = task_id;
-        void* gate_proj_ptr = gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
+        void* gate_proj_ptr = (uint8_t*)gate_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
         float* gate_output_ptr = gate_output_.data() + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
-        void* up_proj_ptr = up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
+        void* up_proj_ptr = (uint8_t*)up_proj_ + ith * config_.stride * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
         float* up_output_ptr = up_output_.data() + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
...
@@ -79,7 +79,7 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
         }
         if (config_.stride % ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) == 0) {
             float* intermediate_fp32_ptr = intermediate_fp32_.data() + ith * config_.stride;
-            void* down_input_ptr = down_input_.data() + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
+            void* down_input_ptr = (uint8_t*)down_input_.data() + ith * config_.stride * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
             from_float(intermediate_fp32_ptr, down_input_ptr, config_.stride, ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
         }
     });
...
@@ -89,11 +89,11 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
     nth = config_.hidden_size / config_.stride;
     backend->do_work_stealing_job(nth, [&](int task_id) {
         int ith = task_id;
-        void* down_proj_ptr = down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
+        void* down_proj_ptr = (uint8_t*)down_proj_ + ith * config_.stride * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
         float* down_output_ptr = down_output_.data() + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_.data(), config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
-            void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+            void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
             from_float(down_output_ptr, output_ptr, config_.stride, config_.hidden_type);
         }
     });
...
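All five changes in this file are the same fix: the byte-offset arithmetic is now performed on a uint8_t* instead of a void*. Arithmetic on void* is not valid standard C++ (GCC accepts it as an extension and treats the pointee size as one byte, while MSVC rejects it with error C2036), so these casts are what let the file compile on Windows. A minimal before/after sketch, using a made-up base/byte_offset rather than the real config fields:

#include <cstdint>
#include <cstdlib>

void* slice_ptr(void* base, std::size_t byte_offset) {
    // void* p = base + byte_offset;          // GCC extension only; MSVC rejects arithmetic on void*
    void* p = (uint8_t*)base + byte_offset;   // portable: pointer arithmetic on a 1-byte element type
    return p;
}

int main() {
    void* buffer = std::malloc(1024);
    void* third_row = slice_ptr(buffer, 2 * 256);  // e.g. skip two rows of 256 bytes each
    (void)third_row;
    std::free(buffer);
    return 0;
}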
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp  (View file @ 11544ef2)

@@ -9,9 +9,9 @@
 **/
 #include "moe.h"
 #include <iostream>
-#include "unistd.h"
+#include <cstdint>

-void* MOE::buffer_ = nullptr;
+uint8_t* MOE::buffer_ = nullptr;

 MOE::MOE(MOEConfig config) {
     config_ = config;
...
@@ -32,7 +32,7 @@ MOE::MOE(MOEConfig config) {
         buffer_size += config_.routed_expert_num * config_.group_max_len * config_.intermediate_size * ggml_type_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type) / ggml_blck_size(ggml_internal_get_type_traits(config_.down_type).vec_dot_type);
         buffer_size += sizeof(float) * config_.routed_expert_num * config_.group_max_len * config_.hidden_size;
         buffer_size += sizeof(float) * config_.group_max_len * config_.hidden_size;
-        buffer_ = malloc(buffer_size);
+        buffer_ = (uint8_t*)malloc(buffer_size);
     }

     uint64_t offset = 0;
...
@@ -95,7 +95,7 @@ MOE::MOE(MOEConfig config) {
     m_local_pos_.resize(config_.group_max_len);
     for (int i = 0; i < config_.group_max_len; i++) {
-        m_local_pos_[i].reserve(config_.expert_num);
+        m_local_pos_[i].resize(config_.routed_expert_num);
     }
     m_local_num_.resize(config_.expert_num);
     m_local_gate_input_ptr_.resize(config_.expert_num);
...
@@ -156,10 +156,10 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
         int expert_idx = task_id / nth;
         uint64_t expert_id = expert_ids[expert_idx];
         int ith = task_id % nth;
-        void* gate_proj_ptr = gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
+        void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
         float* gate_output_ptr = s_gate_output_[expert_idx] + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
-        void* up_proj_ptr = up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
+        void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
         float* up_output_ptr = s_up_output_[expert_idx] + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
...
@@ -184,7 +184,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
         }
         for (int expert_idx = 0; expert_idx < k; expert_idx++) {
             uint64_t expert_id = expert_ids[expert_idx];
-            void* down_proj_ptr = down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
+            void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
             float* down_output_ptr = s_down_output_[expert_idx] + ith * config_.stride;
             llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), s_down_input_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
             for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
...
@@ -193,7 +193,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
         }
         if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
             float* output_fp32_ptr = s_output_fp32_ + ith * config_.stride;
-            void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+            void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
             from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
         }
     });
...
@@ -226,9 +226,9 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
         const void* gate_input_ptr;
         const void* up_input_ptr;
         if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
-            gate_input_ptr = up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+            gate_input_ptr = up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
         } else {
-            to_float(input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
+            to_float((uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
             if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                 from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                 gate_input_ptr = up_input_ptr = m_gate_input_[i];
...
@@ -237,13 +237,13 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
                 from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                 gate_input_ptr = m_gate_input_[i];
             } else {
-                gate_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+                gate_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
             }
             if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                 from_float(m_input_fp32_[i], m_up_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
                 up_input_ptr = m_up_input_[i];
             } else {
-                up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+                up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
             }
         }
     }
...
@@ -258,11 +258,11 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
         int expert_idx = task_id / nth;
         int ith = task_id % nth;
         void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];
-        void* gate_proj_ptr = gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
+        void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
         float* gate_output_ptr = m_local_gate_output_ptr_[expert_idx] + ith * stride;
         llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         void* up_input_ptr = m_local_up_input_ptr_[expert_idx];
-        void* up_proj_ptr = up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
+        void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
         float* up_output_ptr = m_local_up_output_ptr_[expert_idx] + ith * stride;
         llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         for (int i = 0; i < m_local_num_[expert_idx]; i++) {
...
@@ -280,7 +280,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
         int expert_idx = task_id / nth;
         int ith = task_id % nth;
         void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
-        void* down_proj_ptr = down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
+        void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
         float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
         llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
     });
...
@@ -293,18 +293,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
                 m_output_fp32_[i][e] += m_local_down_output_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] * weights[i * k + j];
             }
         }
-        from_float(m_output_fp32_[i], output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
+        from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
     });
 }

 void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
     if (qlen < config_.group_min_len) {
         for (int i = 0; i < qlen; i++) {
-            forward_one(k, expert_ids + i * k, weights + i * k, input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
+            forward_one(k, expert_ids + i * k, weights + i * k, (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
         }
         return;
     }
     int forward_len = std::min(config_.group_max_len, qlen);
     forward_many(forward_len, k, expert_ids, weights, input, output, backend);
-    forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
+    forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
 }
\ No newline at end of file
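Besides the same (uint8_t*) casts as in mlp.cpp (and the switch from "unistd.h", which MSVC does not provide, to <cstdint> for uint8_t), one change here is easy to miss: m_local_pos_[i].reserve(config_.expert_num) becomes m_local_pos_[i].resize(config_.routed_expert_num). reserve only pre-allocates capacity and leaves size() at zero, so indexing the vector afterwards with operator[] is undefined behavior; resize actually creates the elements. A small self-contained illustration with a generic std::vector<int>, not the project's own container:

#include <cassert>
#include <vector>

int main() {
    std::vector<int> reserved;
    reserved.reserve(8);          // capacity >= 8, but size() == 0
    // reserved[3] = 42;          // undefined behavior: element 3 does not exist

    std::vector<int> resized;
    resized.resize(8);            // size() == 8, elements value-initialized to 0
    resized[3] = 42;              // well-defined
    assert(resized.size() == 8 && reserved.empty());
    return 0;
}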
ktransformers/ktransformers_ext/operators/llamafile/moe.h  (View file @ 11544ef2)

@@ -54,7 +54,7 @@ class MOE {
     void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend);

    private:
-    static void* buffer_;
+    static uint8_t* buffer_;
     MOEConfig config_;
     void* gate_proj_;  // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
     void* up_proj_;    // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
...
ktransformers/local_chat.py  (View file @ 11544ef2)

@@ -112,4 +112,4 @@ def local_chat(
     generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens)


 if __name__ == "__main__":
     fire.Fire(local_chat)
\ No newline at end of file
ktransformers/models/custom_cache.py  (View file @ 11544ef2)

@@ -46,6 +46,7 @@ class StaticCache(transformers.StaticCache):
         self.value_cache: List[torch.Tensor] = []
         cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
         if config.architectures[0] == "DeepseekV2ForCausalLM":
+            # TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it automatically
             # key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim)
             # value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim)
             key_shape = (max_batch_size, 1, self.max_cache_len, config.qk_rope_head_dim)
...
ktransformers/operators/experts.py  (View file @ 11544ef2)

@@ -19,7 +19,8 @@ import torch
 import sys, os
 from ktransformers.operators.base_operator import BaseInjectedModule
-sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/build")
+#sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/build/")
+sys.path.append(os.path.dirname(__file__) + "\\..\\ktransformers_ext\\build\\Release")
 import cpuinfer_ext
 from cpuinfer_ext.moe import MOEConfig, MOE
 import ctypes
...
@@ -179,6 +180,7 @@ class MLPCPUExperts(MLPExpertsBase):
     def forward(self, input_tensor, expert_ids, weights):
         # generate, capture and run cuda graph
         if input_tensor.size(0)==1:
+            # TODO: this branch is unreachable, but the shape of input_tensor([1,hidden_size]) and input_tensor_cpu([hidden_size]) is not compatible
             #print("capturing experts")
             MLPCPUExperts.input_tensor_cpu.copy_(input_tensor, non_blocking=True)
             MLPCPUExperts.expert_ids_cpu.copy_(expert_ids, non_blocking=True)
...
@@ -359,7 +361,12 @@ class MLPExpertsTorch(MLPExpertsBase):
         self.down = None

     def forward(self, hidden_states_cpu: torch.Tensor, selected_experts_cpu: torch.Tensor, routing_weights_cpu: torch.Tensor) -> torch.Tensor:
+        # TODO: forward should transfer data to gpu, and make the data transfering capturable using pin memory,
+        # just like CPUInfer MLPCPUExperts. There may be a base class of experts on cpu
+        hidden_states_cpu = hidden_states_cpu.to("cpu")
+        selected_experts_cpu = selected_experts_cpu.to("cpu")
+        routing_weights_cpu = routing_weights_cpu.to("cpu")

         batch_sequence_length, hidden_dim = hidden_states_cpu.size()
         final_hidden_states = torch.zeros(
...
@@ -587,7 +594,7 @@ class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE):
         topk_idx, topk_weight, aux_loss = self.gate(hidden_states)
         hidden_states = hidden_states.view(-1, hidden_states.shape[-1])

-        if sequence_length == 1:
+        if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode"):
             self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0])
             if self.config.n_shared_experts is not None:
                 y_ = self.shared_experts(identity).squeeze(0)
...
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml  (View file @ 11544ef2)

@@ -26,7 +26,7 @@
       prefill_device: "cuda"
       prefill_mlp_type: "MLPExpertsTorch"
       generate_device: "cpu"
       generate_mlp_type: "MLPCPUExperts"
       out_device: "cuda"
     recursive: False # don't recursively inject submodules of this module
 - match:
...
ktransformers/util/custom_gguf.py
View file @
11544ef2
...
@@ -7,6 +7,9 @@ Date : 2024-07-26 08:48:54
...
@@ -7,6 +7,9 @@ Date : 2024-07-26 08:48:54
Version : 1.0.0
Version : 1.0.0
LastEditors : Azure
LastEditors : Azure
LastEditTime : 2024-07-26 09:28:25
LastEditTime : 2024-07-26 09:28:25
Adapted from https://github.com/99991/pygguf/blob/main/gguf.py
Copyright (c) 2023-2024 The ggml authors
Copyright (c) 2024 Thomas Germer
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
'''
# copied from llama.cpp/gguf-py/gguf/constants.py to satisfy dependence of gguf
# copied from llama.cpp/gguf-py/gguf/constants.py to satisfy dependence of gguf
...
@@ -96,6 +99,8 @@ def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantization
...
@@ -96,6 +99,8 @@ def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantization
GGML_TYPES
=
{
GGML_TYPES
=
{
"F32"
:
0
,
"F32"
:
0
,
"F16"
:
1
,
"F16"
:
1
,
"Q4_0"
:
2
,
"Q5_0"
:
6
,
"Q8_0"
:
8
,
"Q8_0"
:
8
,
"Q2_K"
:
10
,
"Q2_K"
:
10
,
"Q3_K"
:
11
,
"Q3_K"
:
11
,
...
@@ -109,6 +114,8 @@ GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()}
...
@@ -109,6 +114,8 @@ GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()}
GGML_BLOCK_SIZES
=
{
GGML_BLOCK_SIZES
=
{
"F32"
:
4
,
"F32"
:
4
,
"F16"
:
2
,
"F16"
:
2
,
"Q4_0"
:
2
+
16
,
"Q5_0"
:
2
+
4
+
16
,
"Q8_0"
:
2
+
32
,
"Q8_0"
:
2
+
32
,
"Q2_K"
:
256
//
16
+
256
//
4
+
2
+
2
,
"Q2_K"
:
256
//
16
+
256
//
4
+
2
+
2
,
"Q3_K"
:
256
//
8
+
256
//
4
+
12
+
2
,
"Q3_K"
:
256
//
8
+
256
//
4
+
12
+
2
,
...
@@ -120,6 +127,8 @@ GGML_BLOCK_SIZES = {
...
@@ -120,6 +127,8 @@ GGML_BLOCK_SIZES = {
GGML_ELEMENTS_PER_BLOCK
=
{
GGML_ELEMENTS_PER_BLOCK
=
{
"F32"
:
1
,
"F32"
:
1
,
"F16"
:
1
,
"F16"
:
1
,
"Q4_0"
:
32
,
"Q5_0"
:
32
,
"Q8_0"
:
32
,
"Q8_0"
:
32
,
"Q2_K"
:
256
,
"Q2_K"
:
256
,
"Q3_K"
:
256
,
"Q3_K"
:
256
,
...
@@ -128,14 +137,6 @@ GGML_ELEMENTS_PER_BLOCK = {
...
@@ -128,14 +137,6 @@ GGML_ELEMENTS_PER_BLOCK = {
"Q6_K"
:
256
,
"Q6_K"
:
256
,
}
}
# DATA_TYPES = {
# "uint32": 4,
# "int32": 5,
# "float32": 6,
# "string": 8,
# "array": 9,
# "uint64": 10,
# }
DATA_TYPES
=
{
DATA_TYPES
=
{
"uint8"
:
0
,
"uint8"
:
0
,
"int8"
:
1
,
"int8"
:
1
,
...
@@ -272,7 +273,7 @@ class GGUFLoader:
...
@@ -272,7 +273,7 @@ class GGUFLoader:
def
load_gguf_tensor
(
self
,
name
:
str
,
device
:
str
=
"cpu"
)
->
torch
.
Tensor
:
def
load_gguf_tensor
(
self
,
name
:
str
,
device
:
str
=
"cpu"
)
->
torch
.
Tensor
:
t
=
self
.
tensor_info
[
name
]
t
=
self
.
tensor_info
[
name
]
shape
=
t
[
"shape"
]
shape
=
t
[
"shape"
]
ggml_type
=
t
[
"ggml_type"
]
ggml_type
=
t
[
"ggml_type"
]
...
@@ -282,10 +283,12 @@ class GGUFLoader:
...
@@ -282,10 +283,12 @@ class GGUFLoader:
ggml_name
=
GGML_NAMES
[
ggml_type
]
ggml_name
=
GGML_NAMES
[
ggml_type
]
data
=
self
.
get_mmap_tensor
(
name
)
data
=
self
.
get_mmap_tensor
(
name
)
if
"cuda"
in
device
.
lower
():
if
"cuda"
in
device
.
lower
():
values
=
GGML_DEQUANTIZE_GPU
[
ggml_name
](
data
,
device
)
values
=
GGML_DEQUANTIZE_GPU
[
ggml_name
](
data
,
device
)
#values = GGML_DEQUANTIZE[ggml_name](data)
#print("load_gguf_tensor")
#values = torch.from_numpy(values).to(device = device)
else
:
else
:
values
=
GGML_DEQUANTIZE
[
ggml_name
](
data
)
values
=
GGML_DEQUANTIZE
[
ggml_name
](
data
)
values
=
torch
.
from_numpy
(
values
)
values
=
torch
.
from_numpy
(
values
)
...
@@ -375,7 +378,7 @@ def dequantize_q2_k(data):
...
@@ -375,7 +378,7 @@ def dequantize_q2_k(data):
return
d
*
(
scales
&
15
)
*
(
tmp
&
3
)
-
dmin
*
(
scales
>>
4
)
return
d
*
(
scales
&
15
)
*
(
tmp
&
3
)
-
dmin
*
(
scales
>>
4
)
def
dequantize_q2_k_gpu
(
data
):
def
dequantize_q2_k_gpu
(
data
):
pass
raise
NotImplementedError
()
def
dequantize_q3_k
(
data
):
def
dequantize_q3_k
(
data
):
# C implementation
# C implementation
...
@@ -420,7 +423,7 @@ def dequantize_q3_k(data):
...
@@ -420,7 +423,7 @@ def dequantize_q3_k(data):
],
axis
=
1
)
],
axis
=
1
)
def
dequantize_q3_k_gpu
(
data
):
def
dequantize_q3_k_gpu
(
data
):
pass
raise
NotImplementedError
()
def
dequantize_q4_k
(
data
):
def
dequantize_q4_k
(
data
):
# C implementation
# C implementation
...
@@ -429,20 +432,16 @@ def dequantize_q4_k(data):
...
@@ -429,20 +432,16 @@ def dequantize_q4_k(data):
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116
block_size
=
GGML_BLOCK_SIZES
[
"Q4_K"
]
block_size
=
GGML_BLOCK_SIZES
[
"Q4_K"
]
num_blocks
=
len
(
data
)
//
block_size
num_blocks
=
len
(
data
)
//
block_size
data_f16
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
).
reshape
(
num_blocks
,
block_size
//
2
)
data_f16
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
).
reshape
(
num_blocks
,
block_size
//
2
)
data_u8
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
block_size
)
data_u8
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
block_size
)
# Casting to float32 because float16 is very slow on CPU
# Casting to float32 because float16 is very slow on CPU
scale_factors
=
data_f16
[:,
0
].
reshape
(
num_blocks
,
1
,
1
).
astype
(
np
.
float32
)
scale_factors
=
data_f16
[:,
0
].
reshape
(
num_blocks
,
1
,
1
).
astype
(
np
.
float32
)
scale_offsets
=
data_f16
[:,
1
].
reshape
(
num_blocks
,
1
,
1
).
astype
(
np
.
float32
)
scale_offsets
=
data_f16
[:,
1
].
reshape
(
num_blocks
,
1
,
1
).
astype
(
np
.
float32
)
qs1
=
data_u8
[:,
4
:
16
].
reshape
(
num_blocks
,
12
,
1
)
qs1
=
data_u8
[:,
4
:
16
].
reshape
(
num_blocks
,
12
,
1
)
qs2
=
data_u8
[:,
16
:].
reshape
(
num_blocks
,
4
,
32
)
qs2
=
data_u8
[:,
16
:].
reshape
(
num_blocks
,
4
,
32
)
# Dequantize scales and offsets (6 bits and 4 + 2 bits)
# Dequantize scales and offsets (6 bits and 4 + 2 bits)
factors
=
scale_factors
*
np
.
concatenate
([
qs1
[:,
0
:
4
]
&
0b111111
,
(
qs1
[:,
8
:]
&
15
)
|
((
qs1
[:,
0
:
4
]
>>
6
)
<<
4
)],
axis
=
1
)
factors
=
scale_factors
*
np
.
concatenate
([
qs1
[:,
0
:
4
]
&
0b111111
,
(
qs1
[:,
8
:]
&
15
)
|
((
qs1
[:,
0
:
4
]
>>
6
)
<<
4
)],
axis
=
1
)
offsets
=
scale_offsets
*
np
.
concatenate
([
qs1
[:,
4
:
8
]
&
0b111111
,
(
qs1
[:,
8
:]
>>
4
)
|
((
qs1
[:,
4
:
8
]
>>
6
)
<<
4
)],
axis
=
1
)
offsets
=
scale_offsets
*
np
.
concatenate
([
qs1
[:,
4
:
8
]
&
0b111111
,
(
qs1
[:,
8
:]
>>
4
)
|
((
qs1
[:,
4
:
8
]
>>
6
)
<<
4
)],
axis
=
1
)
# Interleave low and high quantized bits
# Interleave low and high quantized bits
qs2
=
np
.
stack
([
qs2
&
0xf
,
qs2
>>
4
],
axis
=
2
).
reshape
(
num_blocks
,
8
,
32
)
qs2
=
np
.
stack
([
qs2
&
0xf
,
qs2
>>
4
],
axis
=
2
).
reshape
(
num_blocks
,
8
,
32
)
# Dequantize final weights using scales and offsets
# Dequantize final weights using scales and offsets
...
@@ -513,7 +512,7 @@ def dequantize_q5_k(data):
...
@@ -513,7 +512,7 @@ def dequantize_q5_k(data):
],
axis
=
1
)
],
axis
=
1
)
def
dequantize_q5_k_gpu
(
data
):
def
dequantize_q5_k_gpu
(
data
):
pass
raise
NotImplementedError
()
def
dequantize_q6_k
(
data
):
def
dequantize_q6_k
(
data
):
...
@@ -573,6 +572,48 @@ def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"):
...
@@ -573,6 +572,48 @@ def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"):
data
=
torch
.
from_numpy
(
data
)
data
=
torch
.
from_numpy
(
data
)
return
KTransformersOps
.
dequantize_q6_k
(
data
,
210
,
device
)
return
KTransformersOps
.
dequantize_q6_k
(
data
,
210
,
device
)
def
dequantize_q4_0
(
data
):
# C implementation
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1515
# C struct definition
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L141
num_blocks
=
len
(
data
)
//
GGML_BLOCK_SIZES
[
"Q4_0"
]
scales
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
).
reshape
(
num_blocks
,
1
+
8
)[:,
:
1
].
astype
(
np
.
float32
)
qs
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
2
+
16
)[:,
2
:]
return
np
.
concatenate
([
scales
*
((
qs
&
0xf
).
astype
(
np
.
int8
)
-
8
),
scales
*
((
qs
>>
4
).
astype
(
np
.
int8
)
-
8
),
],
axis
=
1
)
def
dequantize_q4_0_gpu
(
data
):
raise
NotImplementedError
()
def
dequantize_q5_0
(
data
):
# C implementation
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1556
# C struct definition
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L161
num_blocks
=
len
(
data
)
//
GGML_BLOCK_SIZES
[
"Q5_0"
]
scales
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
).
reshape
(
num_blocks
,
1
+
2
+
8
)[:,
:
1
].
astype
(
np
.
float32
)
qh
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
2
+
4
+
16
)[:,
2
:
2
+
4
]
qs
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
2
+
4
+
16
)[:,
2
+
4
:]
bits
=
np
.
unpackbits
(
qh
,
axis
=-
1
,
bitorder
=
"little"
)
x0
=
((
qs
&
0xf
).
astype
(
np
.
int8
)
|
(
bits
[:,
:
16
]
<<
4
))
-
16
x1
=
((
qs
>>
4
).
astype
(
np
.
int8
)
|
(
bits
[:,
16
:]
<<
4
))
-
16
return
np
.
concatenate
([
scales
*
x0
,
scales
*
x1
,
],
axis
=
1
)
def
dequantize_q5_0_gpu
(
data
):
raise
NotImplementedError
()
 def dequantize_q8_0(data):
     # C struct definition
     # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
...
@@ -615,6 +656,8 @@ def dequantize_f16_gpu(data, device):
 GGML_DEQUANTIZE = {
     "F32": dequantize_f32,
     "F16": dequantize_f16,
+    "Q4_0": dequantize_q4_0,
+    "Q5_0": dequantize_q5_0,
     "Q8_0": dequantize_q8_0,
     "Q2_K": dequantize_q2_k,
     "Q3_K": dequantize_q3_k,
...
@@ -626,6 +669,8 @@ GGML_DEQUANTIZE = {
 GGML_DEQUANTIZE_GPU = {
     "F32": dequantize_f32_gpu,
     "F16": dequantize_f16_gpu,
+    "Q4_0": dequantize_q4_0_gpu,
+    "Q5_0": dequantize_q5_0_gpu,
     "Q8_0": dequantize_q8_0_gpu,
     "Q2_K": dequantize_q2_k_gpu,
     "Q3_K": dequantize_q3_k_gpu,
...
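These two tables map GGUF tensor type names to dequantize implementations. A minimal sketch of how a caller might dispatch on the type name (the dequantize_tensor helper is illustrative, not the project's loader API; it assumes the tables above from ktransformers/util/custom_gguf.py are in scope):

import numpy as np

def dequantize_tensor(raw: bytes, ggml_type: str, device: str = "cpu"):
    # Illustrative dispatcher: choose the table by target device.
    data = np.frombuffer(raw, dtype=np.uint8)
    if device == "cpu":
        return GGML_DEQUANTIZE[ggml_type](data)
    return GGML_DEQUANTIZE_GPU[ggml_type](data, device)

# weights = dequantize_tensor(tensor_bytes, "Q4_0")          # NumPy path added in this commit
# weights = dequantize_tensor(tensor_bytes, "Q6_K", "cuda")  # CUDA path via KTransformersOps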
ktransformers/util/utils.py
View file @
11544ef2
...
@@ -79,13 +79,15 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
             logits = cuda_graph_runner(cur_token, position_ids, cache_position)
             past_key_values.change_seq_length(1)
             """
+            inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to("cuda")
+            custom_stream = torch.cuda.Stream()
             with torch.cuda.stream(custom_stream):
-                logits=model(cur_token,
+                logits=model(inputs_embeds = inputs_embeds,
                             position_ids=position_ids,
                             cache_position=cache_position,
                             past_key_values=past_key_values,
                             return_dict=False, use_cache=True)[0]
-            """
+            #"""
             torch.cuda.synchronize()
             #print(logits)
             next_token_scores = logits_warper(inputs, logits[:, -1, :])
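The (still commented-out) eager path above now embeds the current token on CPU, where embed_tokens is kept, and feeds the resulting embedding to the CUDA part of the model via inputs_embeds instead of passing token ids. A standalone sketch of that pattern (the decode_step name is illustrative, not part of the project):

import torch

def decode_step(model, cur_token, position_ids, cache_position, past_key_values):
    # Look up the embedding where embed_tokens lives (CPU), then hand it to the GPU layers.
    inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to("cuda")
    logits = model(inputs_embeds=inputs_embeds,
                   position_ids=position_ids,
                   cache_position=cache_position,
                   past_key_values=past_key_values,
                   return_dict=False, use_cache=True)[0]
    return logits[:, -1, :]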
...
@@ -108,7 +110,6 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
     generated_ids[:, cache_position] = inputs.to(torch_device).to(torch.int)
     past_key_values.cur_idx = cache_position
     start_time = time.time()
-    #custom_stream = torch.cuda.Stream()
     inputs_embeds = model.model.embed_tokens(inputs.to("cpu")).to("cuda")
     logits = model(
...
pyproject.toml
View file @
11544ef2
...
@@ -3,7 +3,8 @@ requires = [
     "setuptools",
     "torch >= 2.3.0",
     "ninja",
-    "packaging"
+    "packaging",
+    "cpufeature"
 ]
 build-backend = "setuptools.build_meta"
...
setup.py
View file @
11544ef2
...
@@ -6,7 +6,7 @@ Author : chenxl
 Date : 2024-07-27 16:15:27
 Version : 1.0.0
 LastEditors : chenxl
-LastEditTime : 2024-07-31 09:44:46
+LastEditTime : 2024-08-08 02:45:15
 Adapted from:
 https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
 Copyright (c) 2023, Tri Dao.
...
@@ -19,6 +19,7 @@ import re
 import ast
 import subprocess
 import platform
+import shutil
 import http.client
 import urllib.request
 import urllib.error
...
@@ -27,6 +28,7 @@ from packaging.version import parse
 import torch.version
 from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
 from setuptools import setup, Extension
+from cpufeature.extension import CPUFeature
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME

 class CpuInstructInfo:
...
@@ -67,6 +69,8 @@ class VersionInfo:
         """
         if sys.platform.startswith("linux"):
             return f'linux_{platform.uname().machine}'
+        elif sys.platform == "win32":
+            return "win_amd64"
         else:
             raise ValueError("Unsupported platform: {}".format(sys.platform))
...
@@ -97,6 +101,15 @@ class VersionInfo:
                 return 'avx2'
             raise ValueError(
                 "Unsupported cpu Instructions: {}".format(flags_line))
+        elif sys.platform == "win32":
+            if CPUFeature.get("AVX512bw", False):
+                return 'fancy'
+            if CPUFeature.get("AVX512f", False):
+                return 'avx512'
+            if CPUFeature.get("AVX2", False):
+                return 'avx2'
+            raise ValueError(
+                "Unsupported cpu Instructions: {}".format(str(CPUFeature)))
         else:
             raise ValueError("Unsupported platform: {}".format(sys.platform))
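On Windows there is no /proc/cpuinfo to parse, so the instruction-set probe now goes through the cpufeature package. A minimal sketch of the same selection order outside setup.py (the pick_instruction_set name is illustrative; the feature keys are the ones used above):

from cpufeature.extension import CPUFeature

def pick_instruction_set() -> str:
    # Prefer the richest profile first, mirroring the order in setup.py.
    if CPUFeature.get("AVX512bw", False):
        return "fancy"      # AVX-512 BW and friends (llamafile's HAVE_FANCY_SIMD build)
    if CPUFeature.get("AVX512f", False):
        return "avx512"
    if CPUFeature.get("AVX2", False):
        return "avx2"
    raise ValueError(f"Unsupported CPU: {CPUFeature}")

print(pick_instruction_set())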
...
@@ -154,7 +167,7 @@ class BuildWheelsCommand(_bdist_wheel):
             wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl")
             print("Raw wheel path", wheel_path)
-            os.rename(wheel_filename, wheel_path)
+            shutil.move(wheel_filename, wheel_path)
         except (urllib.error.HTTPError, urllib.error.URLError, http.client.RemoteDisconnected):
             print("Precompiled wheel not found. Building from source...")
             # If the wheel could not be downloaded, build from source
...
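The switch from os.rename to shutil.move matters on Windows: os.rename raises OSError when source and destination sit on different drives, while shutil.move falls back to copy-and-delete. A small sketch of the difference (the paths are made up):

import os
import shutil

src = r"C:\Temp\ktransformers-0.1.0-win_amd64.whl"   # illustrative paths on two drives
dst = r"D:\build\dist\ktransformers-0.1.0-win_amd64.whl"

try:
    os.rename(src, dst)        # fails with OSError across filesystems/drives
except OSError:
    shutil.move(src, dst)      # copies then removes the source instead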
third_party/llamafile/iqk_mul_mat.inc
View file @
11544ef2
...
@@ -22,7 +22,7 @@
 #include <cstring>
 #include <type_traits>
-#if defined __x86_64__ || defined __aarch64__
+#if defined __x86_64__ || defined __aarch64__ || defined(_M_X64)
 #include "llama.cpp/ggml-impl.h"
 #include "llama.cpp/ggml-quants.h"
...
@@ -225,7 +225,7 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi
     return true;
 }
-#if defined __x86_64__
+#if defined __x86_64__ || defined(_M_X64)
 #if defined HAVE_FANCY_SIMD
 #undef HAVE_FANCY_SIMD
...
@@ -1412,7 +1412,8 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
 bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int) {
-    row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
+    if (ne00 % ggml_blck_size(GGML_TYPE_Q8_K) == 0)
+        row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
     switch (typeA) {
         case GGML_TYPE_Q2_K:
...
third_party/llamafile/iqk_mul_mat_amd_avx2.cpp
View file @
11544ef2
...
@@ -3,6 +3,6 @@
 // Copyrigth 2024 Iwan Kawrakow.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #include "iqk_mul_mat.inc"
 #endif // __x86_64__
third_party/llamafile/iqk_mul_mat_amd_zen4.cpp
View file @
11544ef2
...
@@ -3,7 +3,7 @@
 // Copyrigth 2024 Iwan Kawrakow.
 // Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
 #define iqk_mul_mat iqk_mul_mat_zen4
 #define iqk_mul_mat_moe iqk_mul_mat_moe_zen4
 #include "iqk_mul_mat.inc"
...
third_party/llamafile/sgemm.cpp
View file @
11544ef2
...
@@ -22,19 +22,22 @@
 #include "sgemm.h"
 // #include <cosmo.h>
-#include <cpuid.h>
+// #include <cpuid.h>
 // #include <libc/sysv/consts/hwcap.h>
 #include <stdio.h>
-#include <sys/auxv.h>
+// #include <sys/auxv.h>
 #include <cassert>
 // #include "llamafile.h"

 static const struct GemmFuncs {
-    typeof(llamafile_sgemm)* sgemm;
-    typeof(llamafile_mixmul)* mixmul;
-    typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
+    bool (*sgemm)(long, long, long, const void*, long, const void*, long, void*, long, int, int, int, int, int, int, int);
+    bool (*mixmul)(const struct ggml_compute_params*, const struct ggml_tensor*, const struct ggml_tensor*, const struct ggml_tensor*, struct ggml_tensor*);
+    bool (*iqk_mixmul)(long, long, long, int, int, const void*, const void*, float*, long, long, const void*, int, int);
+    // typeof(llamafile_sgemm)* sgemm;
+    // typeof(llamafile_mixmul)* mixmul;
+    // typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
     GemmFuncs() {
-#ifdef __x86_64__
+#if defined(__x86_64__) || defined(_M_X64)
         // if (X86_HAVE(AVX)) {
         // if (X86_HAVE(FMA)) {
         // if (X86_HAVE(AVX2)) {
...
@@ -86,10 +89,12 @@ static const struct GemmFuncs {
         // sgemm = llamafile_sgemm_unsupported;
         // mixmul = llamafile_mixmul_unsupported;
         // }
 #if defined(__AVX__)
-#if defined(__FMA__)
+#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
 #if defined(__AVX2__)
 #if defined(__AVX512F__)
+        printf("__AVX512F__\n");
 #if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
         // AMD Zen4+ (2023-)
         sgemm = llamafile_sgemm_amd_zen4;
...
third_party/llamafile/tinyblas_cpu.h
View file @
11544ef2
...
@@ -223,7 +223,7 @@ inline float32x4_t badder(float32x4_t a, float b, float32x4_t c, float32x4_t* e)
 }
 #endif
-#if defined(__FMA__)
+#if defined(__FMA__) || (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 template <>
 inline __m256 madd(__m256 a, __m256 b, __m256 c) {
...