Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
ktransformers
Commits
11544ef2
Unverified
Commit
11544ef2
authored
Aug 08, 2024
by
UnicornChan
Committed by
GitHub
Aug 08, 2024
Browse files
Merge pull request #25 from kvcache-ai/windows
Windows Support
parents
442e13bc
0e613b60
Changes
33
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
258 additions
and
96 deletions
+258
-96
ktransformers/ktransformers_ext/CMakeLists.txt
ktransformers/ktransformers_ext/CMakeLists.txt
+19
-4
ktransformers/ktransformers_ext/cpu_backend/task_queue.h
ktransformers/ktransformers_ext/cpu_backend/task_queue.h
+41
-3
ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu
...formers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu
+54
-19
ktransformers/ktransformers_ext/cuda/setup.py
ktransformers/ktransformers_ext/cuda/setup.py
+1
-1
ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp
ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp
+5
-5
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp
+18
-18
ktransformers/ktransformers_ext/operators/llamafile/moe.h
ktransformers/ktransformers_ext/operators/llamafile/moe.h
+1
-1
ktransformers/local_chat.py
ktransformers/local_chat.py
+1
-1
ktransformers/models/custom_cache.py
ktransformers/models/custom_cache.py
+1
-0
ktransformers/operators/experts.py
ktransformers/operators/experts.py
+10
-3
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml
+1
-1
ktransformers/util/custom_gguf.py
ktransformers/util/custom_gguf.py
+62
-17
ktransformers/util/utils.py
ktransformers/util/utils.py
+8
-7
pyproject.toml
pyproject.toml
+2
-1
setup.py
setup.py
+15
-2
third_party/llamafile/iqk_mul_mat.inc
third_party/llamafile/iqk_mul_mat.inc
+4
-3
third_party/llamafile/iqk_mul_mat_amd_avx2.cpp
third_party/llamafile/iqk_mul_mat_amd_avx2.cpp
+1
-1
third_party/llamafile/iqk_mul_mat_amd_zen4.cpp
third_party/llamafile/iqk_mul_mat_amd_zen4.cpp
+1
-1
third_party/llamafile/sgemm.cpp
third_party/llamafile/sgemm.cpp
+12
-7
third_party/llamafile/tinyblas_cpu.h
third_party/llamafile/tinyblas_cpu.h
+1
-1
No files found.
ktransformers/ktransformers_ext/CMakeLists.txt
View file @
11544ef2
cmake_minimum_required
(
VERSION 3.1
6
)
cmake_minimum_required
(
VERSION 3.1
7
)
project
(
cpuinfer_ext VERSION 0.1.0
)
project
(
cpuinfer_ext VERSION 0.1.0
)
set
(
CMAKE_CXX_STANDARD 17
)
set
(
CMAKE_CXX_STANDARD 17
)
...
@@ -190,7 +190,13 @@ else()
...
@@ -190,7 +190,13 @@ else()
message
(
STATUS
"Unknown architecture"
)
message
(
STATUS
"Unknown architecture"
)
endif
()
endif
()
find_package
(
CUDA REQUIRED
)
# message(STATUS "CUDAToolkit_ROOT:${CUDAToolkit_ROOT}")
# find_package(FindCUDAToolkit REQUIRED)
# if(CUDAToolkit_FOUND)
# message(STATUS "Found CUDA cudart lib at:${CUDAToolkit_LIBRARY_DIR}")
# else()
# message(STATUS "Can't found CUDA lib")
# endif()
add_compile_options
(
"$<$<COMPILE_LANGUAGE:CXX>:
${
ARCH_FLAGS
}
>"
)
add_compile_options
(
"$<$<COMPILE_LANGUAGE:CXX>:
${
ARCH_FLAGS
}
>"
)
add_compile_options
(
"$<$<COMPILE_LANGUAGE:C>:
${
ARCH_FLAGS
}
>"
)
add_compile_options
(
"$<$<COMPILE_LANGUAGE:C>:
${
ARCH_FLAGS
}
>"
)
...
@@ -199,7 +205,12 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/pybind11 ${CMAKE_
...
@@ -199,7 +205,12 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/pybind11 ${CMAKE_
add_subdirectory
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/../../third_party/llama.cpp
${
CMAKE_CURRENT_BINARY_DIR
}
/third_party/llama.cpp
)
add_subdirectory
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/../../third_party/llama.cpp
${
CMAKE_CURRENT_BINARY_DIR
}
/third_party/llama.cpp
)
include_directories
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/../../third_party
)
include_directories
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/../../third_party
)
include_directories
(
"
${
CUDA_INCLUDE_DIRS
}
"
)
if
(
WIN32
)
include_directories
(
"$ENV{CUDA_PATH}/include"
)
elseif
(
UNIX
)
find_package
(
CUDA REQUIRED
)
include_directories
(
"
${
CUDA_INCLUDE_DIRS
}
"
)
endif
()
aux_source_directory
(
${
CMAKE_CURRENT_SOURCE_DIR
}
SOURCE_DIR1
)
aux_source_directory
(
${
CMAKE_CURRENT_SOURCE_DIR
}
SOURCE_DIR1
)
aux_source_directory
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/cpu_backend SOURCE_DIR2
)
aux_source_directory
(
${
CMAKE_CURRENT_SOURCE_DIR
}
/cpu_backend SOURCE_DIR2
)
...
@@ -210,4 +221,8 @@ message(STATUS "ALL_SOURCES: ${ALL_SOURCES}")
...
@@ -210,4 +221,8 @@ message(STATUS "ALL_SOURCES: ${ALL_SOURCES}")
pybind11_add_module
(
${
PROJECT_NAME
}
MODULE
${
ALL_SOURCES
}
)
pybind11_add_module
(
${
PROJECT_NAME
}
MODULE
${
ALL_SOURCES
}
)
target_link_libraries
(
${
PROJECT_NAME
}
PRIVATE llama
)
target_link_libraries
(
${
PROJECT_NAME
}
PRIVATE llama
)
target_link_libraries
(
${
PROJECT_NAME
}
PRIVATE
"/usr/local/cuda/lib64/libcudart.so"
)
if
(
WIN32
)
\ No newline at end of file
target_link_libraries
(
${
PROJECT_NAME
}
PRIVATE
"$ENV{CUDA_PATH}/lib/x64/cudart.lib"
)
#CUDA::cudart
elseif
(
UNIX
)
target_link_libraries
(
${
PROJECT_NAME
}
PRIVATE
"$ENV{CUDA_HOME}/lib64/libcudart.so"
)
endif
()
\ No newline at end of file
ktransformers/ktransformers_ext/cpu_backend/task_queue.h
View file @
11544ef2
...
@@ -3,8 +3,8 @@
...
@@ -3,8 +3,8 @@
* @Author : chenht2022
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @Version : 1.0.0
* @LastEditors : chen
ht2022
* @LastEditors : chen
xl
* @LastEditTime : 2024-0
7-25 10:33:47
* @LastEditTime : 2024-0
8-08 04:23:51
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
**/
#ifndef CPUINFER_TASKQUEUE_H
#ifndef CPUINFER_TASKQUEUE_H
...
@@ -17,6 +17,44 @@
...
@@ -17,6 +17,44 @@
#include <queue>
#include <queue>
#include <thread>
#include <thread>
#include <vector>
#include <vector>
#ifdef _WIN32
#include <windows.h>
#endif
class
custom_mutex
{
private:
#ifdef _WIN32
HANDLE
global_mutex
;
#else
std
::
mutex
global_mutex
;
#endif
public:
custom_mutex
()
{
#ifdef _WIN32
HANDLE
global_mutex
;
#endif
}
void
lock
()
{
#ifdef _WIN32
WaitForSingleObject
(
global_mutex
,
INFINITE
);
#else
global_mutex
.
lock
();
#endif
}
void
unlock
()
{
#ifdef _WIN32
ReleaseMutex
(
global_mutex
);
#else
global_mutex
.
lock
();
#endif
}
};
class
TaskQueue
{
class
TaskQueue
{
public:
public:
...
@@ -32,7 +70,7 @@ class TaskQueue {
...
@@ -32,7 +70,7 @@ class TaskQueue {
std
::
queue
<
std
::
function
<
void
()
>>
tasks
;
std
::
queue
<
std
::
function
<
void
()
>>
tasks
;
std
::
thread
worker
;
std
::
thread
worker
;
std
::
mutex
mutex
;
custom_
mutex
mutex
;
std
::
atomic
<
bool
>
sync_flag
;
std
::
atomic
<
bool
>
sync_flag
;
std
::
atomic
<
bool
>
exit_flag
;
std
::
atomic
<
bool
>
exit_flag
;
};
};
...
...
ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu
View file @
11544ef2
...
@@ -1703,28 +1703,63 @@ void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
...
@@ -1703,28 +1703,63 @@ void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s,
thread_m_blocks
=
exec_cfg
.
max_m_blocks
;
thread_m_blocks
=
exec_cfg
.
max_m_blocks
;
}
}
// Define kernel configurations
// Define kernel configurations
if
(
false
)
{
#define undefined_error TORCH_CHECK(false, "Unsupported shapes: MNK = [" + str(prob_m) + ", " + \
str(prob_n) + ", " + str(prob_k) + "]" + \
", has_act_order = " + str(has_act_order) + \
", num_groups = " + str(num_groups) + \
", group_size = " + str(group_size) + \
", thread_m_blocks = " + str(thread_m_blocks) + \
", thread_n_blocks = " + str(thread_n_blocks) + \
", thread_k_blocks = " + str(thread_k_blocks));
if
(
num_bits
==
4
&&
num_threads
==
256
)
{
if
(
false
)
{
}
CALL_IF
(
4
,
32
,
2
,
256
)
CALL_IF
(
4
,
16
,
4
,
256
)
CALL_IF
(
4
,
8
,
8
,
256
)
else
{
undefined_error
}
}
else
if
(
num_bits
==
4
&&
num_threads
==
128
)
{
if
(
false
)
{
}
CALL_IF
(
4
,
8
,
4
,
128
)
CALL_IF
(
4
,
4
,
8
,
128
)
else
{
undefined_error
}
}
else
if
(
num_bits
==
8
&&
num_threads
==
256
)
{
if
(
false
)
{
}
CALL_IF
(
8
,
32
,
2
,
256
)
CALL_IF
(
8
,
16
,
4
,
256
)
CALL_IF
(
8
,
8
,
8
,
256
)
else
{
undefined_error
}
}
else
if
(
num_bits
==
8
&&
num_threads
==
128
)
{
if
(
false
)
{
}
CALL_IF
(
8
,
8
,
4
,
128
)
CALL_IF
(
8
,
4
,
8
,
128
)
else
{
undefined_error
}
}
}
CALL_IF
(
4
,
32
,
2
,
256
)
CALL_IF
(
4
,
16
,
4
,
256
)
CALL_IF
(
4
,
8
,
8
,
256
)
CALL_IF
(
4
,
8
,
4
,
128
)
CALL_IF
(
4
,
4
,
8
,
128
)
CALL_IF
(
8
,
32
,
2
,
256
)
CALL_IF
(
8
,
16
,
4
,
256
)
CALL_IF
(
8
,
8
,
8
,
256
)
CALL_IF
(
8
,
8
,
4
,
128
)
CALL_IF
(
8
,
4
,
8
,
128
)
else
{
else
{
TORCH_CHECK
(
false
,
"Unsupported shapes: MNK = ["
+
str
(
prob_m
)
+
", "
+
undefined_error
str
(
prob_n
)
+
", "
+
str
(
prob_k
)
+
"]"
+
", has_act_order = "
+
str
(
has_act_order
)
+
", num_groups = "
+
str
(
num_groups
)
+
", group_size = "
+
str
(
group_size
)
+
", thread_m_blocks = "
+
str
(
thread_m_blocks
)
+
", thread_n_blocks = "
+
str
(
thread_n_blocks
)
+
", thread_k_blocks = "
+
str
(
thread_k_blocks
));
}
}
A_ptr
+=
16
*
thread_m_blocks
*
(
prob_k
/
8
)
*
par
;
A_ptr
+=
16
*
thread_m_blocks
*
(
prob_k
/
8
)
*
par
;
...
...
ktransformers/ktransformers_ext/cuda/setup.py
View file @
11544ef2
...
@@ -10,7 +10,7 @@ setup(name='KTransformersOps',
...
@@ -10,7 +10,7 @@ setup(name='KTransformersOps',
'custom_gguf/dequant.cu'
,
'custom_gguf/dequant.cu'
,
'binding.cpp'
,
'binding.cpp'
,
'gptq_marlin/gptq_marlin.cu'
,
'gptq_marlin/gptq_marlin.cu'
,
# 'gptq_marlin_repack.cu',
# 'gptq_marlin_repack.cu',
])
])
],
],
cmdclass
=
{
'build_ext'
:
BuildExtension
cmdclass
=
{
'build_ext'
:
BuildExtension
...
...
ktransformers/ktransformers_ext/operators/llamafile/mlp.cpp
View file @
11544ef2
...
@@ -68,10 +68,10 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
...
@@ -68,10 +68,10 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
int
nth
=
config_
.
intermediate_size
/
config_
.
stride
;
int
nth
=
config_
.
intermediate_size
/
config_
.
stride
;
backend
->
do_work_stealing_job
(
nth
,
[
&
](
int
task_id
)
{
backend
->
do_work_stealing_job
(
nth
,
[
&
](
int
task_id
)
{
int
ith
=
task_id
;
int
ith
=
task_id
;
void
*
gate_proj_ptr
=
gate_proj_
+
ith
*
config_
.
stride
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
gate_type
)
/
ggml_blck_size
(
config_
.
gate_type
);
void
*
gate_proj_ptr
=
(
uint8_t
*
)
gate_proj_
+
ith
*
config_
.
stride
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
gate_type
)
/
ggml_blck_size
(
config_
.
gate_type
);
float
*
gate_output_ptr
=
gate_output_
.
data
()
+
ith
*
config_
.
stride
;
float
*
gate_output_ptr
=
gate_output_
.
data
()
+
ith
*
config_
.
stride
;
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_output_ptr
,
config_
.
stride
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
gate_type
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_output_ptr
,
config_
.
stride
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
gate_type
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
void
*
up_proj_ptr
=
up_proj_
+
ith
*
config_
.
stride
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
up_type
)
/
ggml_blck_size
(
config_
.
up_type
);
void
*
up_proj_ptr
=
(
uint8_t
*
)
up_proj_
+
ith
*
config_
.
stride
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
up_type
)
/
ggml_blck_size
(
config_
.
up_type
);
float
*
up_output_ptr
=
up_output_
.
data
()
+
ith
*
config_
.
stride
;
float
*
up_output_ptr
=
up_output_
.
data
()
+
ith
*
config_
.
stride
;
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_output_ptr
,
config_
.
stride
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
up_type
,
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_output_ptr
,
config_
.
stride
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
up_type
,
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
for
(
int
i
=
ith
*
config_
.
stride
;
i
<
(
ith
+
1
)
*
config_
.
stride
;
i
++
)
{
for
(
int
i
=
ith
*
config_
.
stride
;
i
<
(
ith
+
1
)
*
config_
.
stride
;
i
++
)
{
...
@@ -79,7 +79,7 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
...
@@ -79,7 +79,7 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
}
}
if
(
config_
.
stride
%
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
==
0
)
{
if
(
config_
.
stride
%
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
==
0
)
{
float
*
intermediate_fp32_ptr
=
intermediate_fp32_
.
data
()
+
ith
*
config_
.
stride
;
float
*
intermediate_fp32_ptr
=
intermediate_fp32_
.
data
()
+
ith
*
config_
.
stride
;
void
*
down_input_ptr
=
down_input_
.
data
()
+
ith
*
config_
.
stride
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
void
*
down_input_ptr
=
(
uint8_t
*
)
down_input_
.
data
()
+
ith
*
config_
.
stride
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
from_float
(
intermediate_fp32_ptr
,
down_input_ptr
,
config_
.
stride
,
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
from_float
(
intermediate_fp32_ptr
,
down_input_ptr
,
config_
.
stride
,
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
}
}
});
});
...
@@ -89,11 +89,11 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
...
@@ -89,11 +89,11 @@ void MLP::forward(const void* input, void* output, Backend* backend) {
nth
=
config_
.
hidden_size
/
config_
.
stride
;
nth
=
config_
.
hidden_size
/
config_
.
stride
;
backend
->
do_work_stealing_job
(
nth
,
[
&
](
int
task_id
)
{
backend
->
do_work_stealing_job
(
nth
,
[
&
](
int
task_id
)
{
int
ith
=
task_id
;
int
ith
=
task_id
;
void
*
down_proj_ptr
=
down_proj_
+
ith
*
config_
.
stride
*
config_
.
intermediate_size
*
ggml_type_size
(
config_
.
down_type
)
/
ggml_blck_size
(
config_
.
down_type
);
void
*
down_proj_ptr
=
(
uint8_t
*
)
down_proj_
+
ith
*
config_
.
stride
*
config_
.
intermediate_size
*
ggml_type_size
(
config_
.
down_type
)
/
ggml_blck_size
(
config_
.
down_type
);
float
*
down_output_ptr
=
down_output_
.
data
()
+
ith
*
config_
.
stride
;
float
*
down_output_ptr
=
down_output_
.
data
()
+
ith
*
config_
.
stride
;
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_proj_ptr
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_input_
.
data
(),
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_output_ptr
,
config_
.
stride
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
down_type
,
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_proj_ptr
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_input_
.
data
(),
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_output_ptr
,
config_
.
stride
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
down_type
,
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
if
(
config_
.
stride
%
ggml_blck_size
(
config_
.
hidden_type
)
==
0
)
{
if
(
config_
.
stride
%
ggml_blck_size
(
config_
.
hidden_type
)
==
0
)
{
void
*
output_ptr
=
output
+
ith
*
config_
.
stride
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
);
void
*
output_ptr
=
(
uint8_t
*
)
output
+
ith
*
config_
.
stride
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
);
from_float
(
down_output_ptr
,
output_ptr
,
config_
.
stride
,
config_
.
hidden_type
);
from_float
(
down_output_ptr
,
output_ptr
,
config_
.
stride
,
config_
.
hidden_type
);
}
}
});
});
...
...
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp
View file @
11544ef2
...
@@ -9,9 +9,9 @@
...
@@ -9,9 +9,9 @@
**/
**/
#include "moe.h"
#include "moe.h"
#include <iostream>
#include <iostream>
#include
"unistd.h"
#include
<cstdint>
void
*
MOE
::
buffer_
=
nullptr
;
uint8_t
*
MOE
::
buffer_
=
nullptr
;
MOE
::
MOE
(
MOEConfig
config
)
{
MOE
::
MOE
(
MOEConfig
config
)
{
config_
=
config
;
config_
=
config
;
...
@@ -32,7 +32,7 @@ MOE::MOE(MOEConfig config) {
...
@@ -32,7 +32,7 @@ MOE::MOE(MOEConfig config) {
buffer_size
+=
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
intermediate_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
buffer_size
+=
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
intermediate_size
*
ggml_type_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
)
/
ggml_blck_size
(
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
);
buffer_size
+=
sizeof
(
float
)
*
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
hidden_size
;
buffer_size
+=
sizeof
(
float
)
*
config_
.
routed_expert_num
*
config_
.
group_max_len
*
config_
.
hidden_size
;
buffer_size
+=
sizeof
(
float
)
*
config_
.
group_max_len
*
config_
.
hidden_size
;
buffer_size
+=
sizeof
(
float
)
*
config_
.
group_max_len
*
config_
.
hidden_size
;
buffer_
=
malloc
(
buffer_size
);
buffer_
=
(
uint8_t
*
)
malloc
(
buffer_size
);
}
}
uint64_t
offset
=
0
;
uint64_t
offset
=
0
;
...
@@ -95,7 +95,7 @@ MOE::MOE(MOEConfig config) {
...
@@ -95,7 +95,7 @@ MOE::MOE(MOEConfig config) {
m_local_pos_
.
resize
(
config_
.
group_max_len
);
m_local_pos_
.
resize
(
config_
.
group_max_len
);
for
(
int
i
=
0
;
i
<
config_
.
group_max_len
;
i
++
)
{
for
(
int
i
=
0
;
i
<
config_
.
group_max_len
;
i
++
)
{
m_local_pos_
[
i
].
res
erv
e
(
config_
.
expert_num
);
m_local_pos_
[
i
].
res
iz
e
(
config_
.
routed_
expert_num
);
}
}
m_local_num_
.
resize
(
config_
.
expert_num
);
m_local_num_
.
resize
(
config_
.
expert_num
);
m_local_gate_input_ptr_
.
resize
(
config_
.
expert_num
);
m_local_gate_input_ptr_
.
resize
(
config_
.
expert_num
);
...
@@ -156,10 +156,10 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
...
@@ -156,10 +156,10 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
int
expert_idx
=
task_id
/
nth
;
int
expert_idx
=
task_id
/
nth
;
uint64_t
expert_id
=
expert_ids
[
expert_idx
];
uint64_t
expert_id
=
expert_ids
[
expert_idx
];
int
ith
=
task_id
%
nth
;
int
ith
=
task_id
%
nth
;
void
*
gate_proj_ptr
=
gate_proj_
+
(
expert_id
*
config_
.
intermediate_size
+
ith
*
config_
.
stride
)
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
gate_type
)
/
ggml_blck_size
(
config_
.
gate_type
);
void
*
gate_proj_ptr
=
(
uint8_t
*
)
gate_proj_
+
(
expert_id
*
config_
.
intermediate_size
+
ith
*
config_
.
stride
)
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
gate_type
)
/
ggml_blck_size
(
config_
.
gate_type
);
float
*
gate_output_ptr
=
s_gate_output_
[
expert_idx
]
+
ith
*
config_
.
stride
;
float
*
gate_output_ptr
=
s_gate_output_
[
expert_idx
]
+
ith
*
config_
.
stride
;
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_output_ptr
,
config_
.
stride
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
gate_type
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_output_ptr
,
config_
.
stride
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
gate_type
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
void
*
up_proj_ptr
=
up_proj_
+
(
expert_id
*
config_
.
intermediate_size
+
ith
*
config_
.
stride
)
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
up_type
)
/
ggml_blck_size
(
config_
.
up_type
);
void
*
up_proj_ptr
=
(
uint8_t
*
)
up_proj_
+
(
expert_id
*
config_
.
intermediate_size
+
ith
*
config_
.
stride
)
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
up_type
)
/
ggml_blck_size
(
config_
.
up_type
);
float
*
up_output_ptr
=
s_up_output_
[
expert_idx
]
+
ith
*
config_
.
stride
;
float
*
up_output_ptr
=
s_up_output_
[
expert_idx
]
+
ith
*
config_
.
stride
;
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_output_ptr
,
config_
.
stride
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
up_type
,
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_output_ptr
,
config_
.
stride
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
up_type
,
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
for
(
int
i
=
ith
*
config_
.
stride
;
i
<
(
ith
+
1
)
*
config_
.
stride
;
i
++
)
{
for
(
int
i
=
ith
*
config_
.
stride
;
i
<
(
ith
+
1
)
*
config_
.
stride
;
i
++
)
{
...
@@ -184,7 +184,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
...
@@ -184,7 +184,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
}
}
for
(
int
expert_idx
=
0
;
expert_idx
<
k
;
expert_idx
++
)
{
for
(
int
expert_idx
=
0
;
expert_idx
<
k
;
expert_idx
++
)
{
uint64_t
expert_id
=
expert_ids
[
expert_idx
];
uint64_t
expert_id
=
expert_ids
[
expert_idx
];
void
*
down_proj_ptr
=
down_proj_
+
(
expert_id
*
config_
.
hidden_size
+
ith
*
config_
.
stride
)
*
config_
.
intermediate_size
*
ggml_type_size
(
config_
.
down_type
)
/
ggml_blck_size
(
config_
.
down_type
);
void
*
down_proj_ptr
=
(
uint8_t
*
)
down_proj_
+
(
expert_id
*
config_
.
hidden_size
+
ith
*
config_
.
stride
)
*
config_
.
intermediate_size
*
ggml_type_size
(
config_
.
down_type
)
/
ggml_blck_size
(
config_
.
down_type
);
float
*
down_output_ptr
=
s_down_output_
[
expert_idx
]
+
ith
*
config_
.
stride
;
float
*
down_output_ptr
=
s_down_output_
[
expert_idx
]
+
ith
*
config_
.
stride
;
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_proj_ptr
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
s_down_input_
[
expert_idx
],
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_output_ptr
,
config_
.
stride
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
down_type
,
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
llamafile_sgemm
(
config_
.
stride
,
1
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_proj_ptr
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
s_down_input_
[
expert_idx
],
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_output_ptr
,
config_
.
stride
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
down_type
,
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
for
(
int
i
=
ith
*
config_
.
stride
;
i
<
(
ith
+
1
)
*
config_
.
stride
;
i
++
)
{
for
(
int
i
=
ith
*
config_
.
stride
;
i
<
(
ith
+
1
)
*
config_
.
stride
;
i
++
)
{
...
@@ -193,7 +193,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
...
@@ -193,7 +193,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
}
}
if
(
config_
.
stride
%
ggml_blck_size
(
config_
.
hidden_type
)
==
0
)
{
if
(
config_
.
stride
%
ggml_blck_size
(
config_
.
hidden_type
)
==
0
)
{
float
*
output_fp32_ptr
=
s_output_fp32_
+
ith
*
config_
.
stride
;
float
*
output_fp32_ptr
=
s_output_fp32_
+
ith
*
config_
.
stride
;
void
*
output_ptr
=
output
+
ith
*
config_
.
stride
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
);
void
*
output_ptr
=
(
uint8_t
*
)
output
+
ith
*
config_
.
stride
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
);
from_float
(
output_fp32_ptr
,
output_ptr
,
config_
.
stride
,
config_
.
hidden_type
);
from_float
(
output_fp32_ptr
,
output_ptr
,
config_
.
stride
,
config_
.
hidden_type
);
}
}
});
});
...
@@ -226,9 +226,9 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
...
@@ -226,9 +226,9 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
const
void
*
gate_input_ptr
;
const
void
*
gate_input_ptr
;
const
void
*
up_input_ptr
;
const
void
*
up_input_ptr
;
if
(
config_
.
hidden_type
==
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
&&
config_
.
hidden_type
==
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
{
if
(
config_
.
hidden_type
==
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
&&
config_
.
hidden_type
==
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
{
gate_input_ptr
=
up_input_ptr
=
input
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
);
gate_input_ptr
=
up_input_ptr
=
(
uint8_t
*
)
input
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
);
}
else
{
}
else
{
to_float
(
input
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
m_input_fp32_
[
i
],
config_
.
hidden_size
,
config_
.
hidden_type
);
to_float
(
(
uint8_t
*
)
input
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
m_input_fp32_
[
i
],
config_
.
hidden_size
,
config_
.
hidden_type
);
if
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
==
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
{
if
(
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
==
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
{
from_float
(
m_input_fp32_
[
i
],
m_gate_input_
[
i
],
config_
.
hidden_size
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
);
from_float
(
m_input_fp32_
[
i
],
m_gate_input_
[
i
],
config_
.
hidden_size
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
);
gate_input_ptr
=
up_input_ptr
=
m_gate_input_
[
i
];
gate_input_ptr
=
up_input_ptr
=
m_gate_input_
[
i
];
...
@@ -237,13 +237,13 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
...
@@ -237,13 +237,13 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
from_float
(
m_input_fp32_
[
i
],
m_gate_input_
[
i
],
config_
.
hidden_size
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
);
from_float
(
m_input_fp32_
[
i
],
m_gate_input_
[
i
],
config_
.
hidden_size
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
);
gate_input_ptr
=
m_gate_input_
[
i
];
gate_input_ptr
=
m_gate_input_
[
i
];
}
else
{
}
else
{
gate_input_ptr
=
input
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
);
gate_input_ptr
=
(
uint8_t
*
)
input
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
);
}
}
if
(
config_
.
hidden_type
!=
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
{
if
(
config_
.
hidden_type
!=
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
)
{
from_float
(
m_input_fp32_
[
i
],
m_up_input_
[
i
],
config_
.
hidden_size
,
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
);
from_float
(
m_input_fp32_
[
i
],
m_up_input_
[
i
],
config_
.
hidden_size
,
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
);
up_input_ptr
=
m_up_input_
[
i
];
up_input_ptr
=
m_up_input_
[
i
];
}
else
{
}
else
{
up_input_ptr
=
input
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
);
up_input_ptr
=
(
uint8_t
*
)
input
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
);
}
}
}
}
}
}
...
@@ -258,11 +258,11 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
...
@@ -258,11 +258,11 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
int
expert_idx
=
task_id
/
nth
;
int
expert_idx
=
task_id
/
nth
;
int
ith
=
task_id
%
nth
;
int
ith
=
task_id
%
nth
;
void
*
gate_input_ptr
=
m_local_gate_input_ptr_
[
expert_idx
];
void
*
gate_input_ptr
=
m_local_gate_input_ptr_
[
expert_idx
];
void
*
gate_proj_ptr
=
gate_proj_
+
(
expert_idx
*
config_
.
intermediate_size
+
ith
*
stride
)
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
gate_type
)
/
ggml_blck_size
(
config_
.
gate_type
);
void
*
gate_proj_ptr
=
(
uint8_t
*
)
gate_proj_
+
(
expert_idx
*
config_
.
intermediate_size
+
ith
*
stride
)
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
gate_type
)
/
ggml_blck_size
(
config_
.
gate_type
);
float
*
gate_output_ptr
=
m_local_gate_output_ptr_
[
expert_idx
]
+
ith
*
stride
;
float
*
gate_output_ptr
=
m_local_gate_output_ptr_
[
expert_idx
]
+
ith
*
stride
;
llamafile_sgemm
(
stride
,
m_local_num_
[
expert_idx
],
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_output_ptr
,
config_
.
intermediate_size
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
gate_type
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
llamafile_sgemm
(
stride
,
m_local_num_
[
expert_idx
],
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
gate_type
),
gate_output_ptr
,
config_
.
intermediate_size
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
gate_type
,
ggml_internal_get_type_traits
(
config_
.
gate_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
void
*
up_input_ptr
=
m_local_up_input_ptr_
[
expert_idx
];
void
*
up_input_ptr
=
m_local_up_input_ptr_
[
expert_idx
];
void
*
up_proj_ptr
=
up_proj_
+
(
expert_idx
*
config_
.
intermediate_size
+
ith
*
stride
)
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
up_type
)
/
ggml_blck_size
(
config_
.
up_type
);
void
*
up_proj_ptr
=
(
uint8_t
*
)
up_proj_
+
(
expert_idx
*
config_
.
intermediate_size
+
ith
*
stride
)
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
up_type
)
/
ggml_blck_size
(
config_
.
up_type
);
float
*
up_output_ptr
=
m_local_up_output_ptr_
[
expert_idx
]
+
ith
*
stride
;
float
*
up_output_ptr
=
m_local_up_output_ptr_
[
expert_idx
]
+
ith
*
stride
;
llamafile_sgemm
(
stride
,
m_local_num_
[
expert_idx
],
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_output_ptr
,
config_
.
intermediate_size
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
up_type
,
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
llamafile_sgemm
(
stride
,
m_local_num_
[
expert_idx
],
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_proj_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_input_ptr
,
config_
.
hidden_size
/
ggml_blck_size
(
config_
.
up_type
),
up_output_ptr
,
config_
.
intermediate_size
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
up_type
,
ggml_internal_get_type_traits
(
config_
.
up_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
for
(
int
i
=
0
;
i
<
m_local_num_
[
expert_idx
];
i
++
)
{
for
(
int
i
=
0
;
i
<
m_local_num_
[
expert_idx
];
i
++
)
{
...
@@ -280,7 +280,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
...
@@ -280,7 +280,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
int
expert_idx
=
task_id
/
nth
;
int
expert_idx
=
task_id
/
nth
;
int
ith
=
task_id
%
nth
;
int
ith
=
task_id
%
nth
;
void
*
down_input_ptr
=
m_local_down_input_ptr_
[
expert_idx
];
void
*
down_input_ptr
=
m_local_down_input_ptr_
[
expert_idx
];
void
*
down_proj_ptr
=
down_proj_
+
(
expert_idx
*
config_
.
hidden_size
+
ith
*
stride
)
*
config_
.
intermediate_size
*
ggml_type_size
(
config_
.
down_type
)
/
ggml_blck_size
(
config_
.
down_type
);
void
*
down_proj_ptr
=
(
uint8_t
*
)
down_proj_
+
(
expert_idx
*
config_
.
hidden_size
+
ith
*
stride
)
*
config_
.
intermediate_size
*
ggml_type_size
(
config_
.
down_type
)
/
ggml_blck_size
(
config_
.
down_type
);
float
*
down_output_ptr
=
m_local_down_output_ptr_
[
expert_idx
]
+
ith
*
stride
;
float
*
down_output_ptr
=
m_local_down_output_ptr_
[
expert_idx
]
+
ith
*
stride
;
llamafile_sgemm
(
stride
,
m_local_num_
[
expert_idx
],
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_proj_ptr
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_input_ptr
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_output_ptr
,
config_
.
hidden_size
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
down_type
,
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
llamafile_sgemm
(
stride
,
m_local_num_
[
expert_idx
],
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_proj_ptr
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_input_ptr
,
config_
.
intermediate_size
/
ggml_blck_size
(
config_
.
down_type
),
down_output_ptr
,
config_
.
hidden_size
,
0
,
1
,
GGML_TASK_TYPE_COMPUTE
,
config_
.
down_type
,
ggml_internal_get_type_traits
(
config_
.
down_type
).
vec_dot_type
,
GGML_TYPE_F32
,
GGML_PREC_DEFAULT
);
});
});
...
@@ -293,18 +293,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
...
@@ -293,18 +293,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
m_output_fp32_
[
i
][
e
]
+=
m_local_down_output_ptr_
[
expert_ids
[
i
*
k
+
j
]][
m_local_pos_
[
i
][
j
]
*
config_
.
hidden_size
+
e
]
*
weights
[
i
*
k
+
j
];
m_output_fp32_
[
i
][
e
]
+=
m_local_down_output_ptr_
[
expert_ids
[
i
*
k
+
j
]][
m_local_pos_
[
i
][
j
]
*
config_
.
hidden_size
+
e
]
*
weights
[
i
*
k
+
j
];
}
}
}
}
from_float
(
m_output_fp32_
[
i
],
output
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
config_
.
hidden_size
,
config_
.
hidden_type
);
from_float
(
m_output_fp32_
[
i
],
(
uint8_t
*
)
output
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
config_
.
hidden_size
,
config_
.
hidden_type
);
});
});
}
}
void
MOE
::
forward
(
int
qlen
,
int
k
,
const
uint64_t
*
expert_ids
,
const
float
*
weights
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
)
{
void
MOE
::
forward
(
int
qlen
,
int
k
,
const
uint64_t
*
expert_ids
,
const
float
*
weights
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
)
{
if
(
qlen
<
config_
.
group_min_len
)
{
if
(
qlen
<
config_
.
group_min_len
)
{
for
(
int
i
=
0
;
i
<
qlen
;
i
++
)
{
for
(
int
i
=
0
;
i
<
qlen
;
i
++
)
{
forward_one
(
k
,
expert_ids
+
i
*
k
,
weights
+
i
*
k
,
input
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
output
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
backend
);
forward_one
(
k
,
expert_ids
+
i
*
k
,
weights
+
i
*
k
,
(
uint8_t
*
)
input
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
(
uint8_t
*
)
output
+
i
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
backend
);
}
}
return
;
return
;
}
}
int
forward_len
=
std
::
min
(
config_
.
group_max_len
,
qlen
);
int
forward_len
=
std
::
min
(
config_
.
group_max_len
,
qlen
);
forward_many
(
forward_len
,
k
,
expert_ids
,
weights
,
input
,
output
,
backend
);
forward_many
(
forward_len
,
k
,
expert_ids
,
weights
,
input
,
output
,
backend
);
forward
(
qlen
-
forward_len
,
k
,
expert_ids
+
forward_len
*
k
,
weights
+
forward_len
*
k
,
input
+
forward_len
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
output
+
forward_len
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
backend
);
forward
(
qlen
-
forward_len
,
k
,
expert_ids
+
forward_len
*
k
,
weights
+
forward_len
*
k
,
(
uint8_t
*
)
input
+
forward_len
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
(
uint8_t
*
)
output
+
forward_len
*
config_
.
hidden_size
*
ggml_type_size
(
config_
.
hidden_type
)
/
ggml_blck_size
(
config_
.
hidden_type
),
backend
);
}
}
\ No newline at end of file
ktransformers/ktransformers_ext/operators/llamafile/moe.h
View file @
11544ef2
...
@@ -54,7 +54,7 @@ class MOE {
...
@@ -54,7 +54,7 @@ class MOE {
void
forward
(
int
qlen
,
int
k
,
const
uint64_t
*
expert_ids
,
const
float
*
weights
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
);
void
forward
(
int
qlen
,
int
k
,
const
uint64_t
*
expert_ids
,
const
float
*
weights
,
const
void
*
input
,
void
*
output
,
Backend
*
backend
);
private:
private:
static
void
*
buffer_
;
static
uint8_t
*
buffer_
;
MOEConfig
config_
;
MOEConfig
config_
;
void
*
gate_proj_
;
// [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
void
*
gate_proj_
;
// [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
void
*
up_proj_
;
// [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
void
*
up_proj_
;
// [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
...
...
ktransformers/local_chat.py
View file @
11544ef2
...
@@ -112,4 +112,4 @@ def local_chat(
...
@@ -112,4 +112,4 @@ def local_chat(
generated
=
prefill_and_generate
(
model
,
tokenizer
,
input_tensor
.
cuda
(),
max_new_tokens
)
generated
=
prefill_and_generate
(
model
,
tokenizer
,
input_tensor
.
cuda
(),
max_new_tokens
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
fire
.
Fire
(
local_chat
)
fire
.
Fire
(
local_chat
)
\ No newline at end of file
ktransformers/models/custom_cache.py
View file @
11544ef2
...
@@ -46,6 +46,7 @@ class StaticCache(transformers.StaticCache):
...
@@ -46,6 +46,7 @@ class StaticCache(transformers.StaticCache):
self
.
value_cache
:
List
[
torch
.
Tensor
]
=
[]
self
.
value_cache
:
List
[
torch
.
Tensor
]
=
[]
cache_shape
=
(
max_batch_size
,
self
.
num_key_value_heads
,
self
.
max_cache_len
,
self
.
head_dim
)
cache_shape
=
(
max_batch_size
,
self
.
num_key_value_heads
,
self
.
max_cache_len
,
self
.
head_dim
)
if
config
.
architectures
[
0
]
==
"DeepseekV2ForCausalLM"
:
if
config
.
architectures
[
0
]
==
"DeepseekV2ForCausalLM"
:
# TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it automatically
# key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim)
# key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim)
# value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim)
# value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim)
key_shape
=
(
max_batch_size
,
1
,
self
.
max_cache_len
,
config
.
qk_rope_head_dim
)
key_shape
=
(
max_batch_size
,
1
,
self
.
max_cache_len
,
config
.
qk_rope_head_dim
)
...
...
ktransformers/operators/experts.py
View file @
11544ef2
...
@@ -19,7 +19,8 @@ import torch
...
@@ -19,7 +19,8 @@ import torch
import
sys
,
os
import
sys
,
os
from
ktransformers.operators.base_operator
import
BaseInjectedModule
from
ktransformers.operators.base_operator
import
BaseInjectedModule
sys
.
path
.
append
(
os
.
path
.
dirname
(
__file__
)
+
"/../ktransformers_ext/build"
)
#sys.path.append(os.path.dirname(__file__) + "/../ktransformers_ext/build/")
sys
.
path
.
append
(
os
.
path
.
dirname
(
__file__
)
+
"
\\
..
\\
ktransformers_ext
\\
build
\\
Release"
)
import
cpuinfer_ext
import
cpuinfer_ext
from
cpuinfer_ext.moe
import
MOEConfig
,
MOE
from
cpuinfer_ext.moe
import
MOEConfig
,
MOE
import
ctypes
import
ctypes
...
@@ -179,6 +180,7 @@ class MLPCPUExperts(MLPExpertsBase):
...
@@ -179,6 +180,7 @@ class MLPCPUExperts(MLPExpertsBase):
def
forward
(
self
,
input_tensor
,
expert_ids
,
weights
):
def
forward
(
self
,
input_tensor
,
expert_ids
,
weights
):
# generate, capture and run cuda graph
# generate, capture and run cuda graph
if
input_tensor
.
size
(
0
)
==
1
:
if
input_tensor
.
size
(
0
)
==
1
:
# TODO: this branch is unreachable, but the shape of input_tensor([1,hidden_size]) and input_tensor_cpu([hidden_size]) is not compatible
#print("capturing experts")
#print("capturing experts")
MLPCPUExperts
.
input_tensor_cpu
.
copy_
(
input_tensor
,
non_blocking
=
True
)
MLPCPUExperts
.
input_tensor_cpu
.
copy_
(
input_tensor
,
non_blocking
=
True
)
MLPCPUExperts
.
expert_ids_cpu
.
copy_
(
expert_ids
,
non_blocking
=
True
)
MLPCPUExperts
.
expert_ids_cpu
.
copy_
(
expert_ids
,
non_blocking
=
True
)
...
@@ -359,7 +361,12 @@ class MLPExpertsTorch(MLPExpertsBase):
...
@@ -359,7 +361,12 @@ class MLPExpertsTorch(MLPExpertsBase):
self
.
down
=
None
self
.
down
=
None
def
forward
(
self
,
hidden_states_cpu
:
torch
.
Tensor
,
selected_experts_cpu
:
torch
.
Tensor
,
routing_weights_cpu
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
forward
(
self
,
hidden_states_cpu
:
torch
.
Tensor
,
selected_experts_cpu
:
torch
.
Tensor
,
routing_weights_cpu
:
torch
.
Tensor
)
->
torch
.
Tensor
:
# TODO: forward should transfer data to gpu, and make the data transfering capturable using pin memory,
# just like CPUInfer MLPCPUExperts. There may be a base class of experts on cpu
hidden_states_cpu
=
hidden_states_cpu
.
to
(
"cpu"
)
selected_experts_cpu
=
selected_experts_cpu
.
to
(
"cpu"
)
routing_weights_cpu
=
routing_weights_cpu
.
to
(
"cpu"
)
batch_sequence_length
,
hidden_dim
=
hidden_states_cpu
.
size
()
batch_sequence_length
,
hidden_dim
=
hidden_states_cpu
.
size
()
final_hidden_states
=
torch
.
zeros
(
final_hidden_states
=
torch
.
zeros
(
...
@@ -587,7 +594,7 @@ class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE):
...
@@ -587,7 +594,7 @@ class DeepseekV2MoEInjected(BaseInjectedModule, DeepseekV2MoE):
topk_idx
,
topk_weight
,
aux_loss
=
self
.
gate
(
hidden_states
)
topk_idx
,
topk_weight
,
aux_loss
=
self
.
gate
(
hidden_states
)
hidden_states
=
hidden_states
.
view
(
-
1
,
hidden_states
.
shape
[
-
1
])
hidden_states
=
hidden_states
.
view
(
-
1
,
hidden_states
.
shape
[
-
1
])
if
sequence_length
==
1
:
if
sequence_length
==
1
and
hasattr
(
self
.
experts
.
generate_experts
,
"submit_for_one_decode"
)
:
self
.
experts
.
generate_experts
.
submit_for_one_decode
(
hidden_states
[
0
],
topk_idx
[
0
],
topk_weight
[
0
])
self
.
experts
.
generate_experts
.
submit_for_one_decode
(
hidden_states
[
0
],
topk_idx
[
0
],
topk_weight
[
0
])
if
self
.
config
.
n_shared_experts
is
not
None
:
if
self
.
config
.
n_shared_experts
is
not
None
:
y_
=
self
.
shared_experts
(
identity
).
squeeze
(
0
)
y_
=
self
.
shared_experts
(
identity
).
squeeze
(
0
)
...
...
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml
View file @
11544ef2
...
@@ -26,7 +26,7 @@
...
@@ -26,7 +26,7 @@
prefill_device
:
"
cuda"
prefill_device
:
"
cuda"
prefill_mlp_type
:
"
MLPExpertsTorch"
prefill_mlp_type
:
"
MLPExpertsTorch"
generate_device
:
"
cpu"
generate_device
:
"
cpu"
generate_mlp_type
:
"
MLPCPUExperts"
generate_mlp_type
:
"
MLPCPUExperts"
out_device
:
"
cuda"
out_device
:
"
cuda"
recursive
:
False
# don't recursively inject submodules of this module
recursive
:
False
# don't recursively inject submodules of this module
-
match
:
-
match
:
...
...
ktransformers/util/custom_gguf.py
View file @
11544ef2
...
@@ -7,6 +7,9 @@ Date : 2024-07-26 08:48:54
...
@@ -7,6 +7,9 @@ Date : 2024-07-26 08:48:54
Version : 1.0.0
Version : 1.0.0
LastEditors : Azure
LastEditors : Azure
LastEditTime : 2024-07-26 09:28:25
LastEditTime : 2024-07-26 09:28:25
Adapted from https://github.com/99991/pygguf/blob/main/gguf.py
Copyright (c) 2023-2024 The ggml authors
Copyright (c) 2024 Thomas Germer
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
'''
# copied from llama.cpp/gguf-py/gguf/constants.py to satisfy dependence of gguf
# copied from llama.cpp/gguf-py/gguf/constants.py to satisfy dependence of gguf
...
@@ -96,6 +99,8 @@ def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantization
...
@@ -96,6 +99,8 @@ def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantization
GGML_TYPES
=
{
GGML_TYPES
=
{
"F32"
:
0
,
"F32"
:
0
,
"F16"
:
1
,
"F16"
:
1
,
"Q4_0"
:
2
,
"Q5_0"
:
6
,
"Q8_0"
:
8
,
"Q8_0"
:
8
,
"Q2_K"
:
10
,
"Q2_K"
:
10
,
"Q3_K"
:
11
,
"Q3_K"
:
11
,
...
@@ -109,6 +114,8 @@ GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()}
...
@@ -109,6 +114,8 @@ GGML_NAMES = {ggml_type: name for name, ggml_type in GGML_TYPES.items()}
GGML_BLOCK_SIZES
=
{
GGML_BLOCK_SIZES
=
{
"F32"
:
4
,
"F32"
:
4
,
"F16"
:
2
,
"F16"
:
2
,
"Q4_0"
:
2
+
16
,
"Q5_0"
:
2
+
4
+
16
,
"Q8_0"
:
2
+
32
,
"Q8_0"
:
2
+
32
,
"Q2_K"
:
256
//
16
+
256
//
4
+
2
+
2
,
"Q2_K"
:
256
//
16
+
256
//
4
+
2
+
2
,
"Q3_K"
:
256
//
8
+
256
//
4
+
12
+
2
,
"Q3_K"
:
256
//
8
+
256
//
4
+
12
+
2
,
...
@@ -120,6 +127,8 @@ GGML_BLOCK_SIZES = {
...
@@ -120,6 +127,8 @@ GGML_BLOCK_SIZES = {
GGML_ELEMENTS_PER_BLOCK
=
{
GGML_ELEMENTS_PER_BLOCK
=
{
"F32"
:
1
,
"F32"
:
1
,
"F16"
:
1
,
"F16"
:
1
,
"Q4_0"
:
32
,
"Q5_0"
:
32
,
"Q8_0"
:
32
,
"Q8_0"
:
32
,
"Q2_K"
:
256
,
"Q2_K"
:
256
,
"Q3_K"
:
256
,
"Q3_K"
:
256
,
...
@@ -128,14 +137,6 @@ GGML_ELEMENTS_PER_BLOCK = {
...
@@ -128,14 +137,6 @@ GGML_ELEMENTS_PER_BLOCK = {
"Q6_K"
:
256
,
"Q6_K"
:
256
,
}
}
# DATA_TYPES = {
# "uint32": 4,
# "int32": 5,
# "float32": 6,
# "string": 8,
# "array": 9,
# "uint64": 10,
# }
DATA_TYPES
=
{
DATA_TYPES
=
{
"uint8"
:
0
,
"uint8"
:
0
,
"int8"
:
1
,
"int8"
:
1
,
...
@@ -272,7 +273,7 @@ class GGUFLoader:
...
@@ -272,7 +273,7 @@ class GGUFLoader:
def
load_gguf_tensor
(
self
,
name
:
str
,
device
:
str
=
"cpu"
)
->
torch
.
Tensor
:
def
load_gguf_tensor
(
self
,
name
:
str
,
device
:
str
=
"cpu"
)
->
torch
.
Tensor
:
t
=
self
.
tensor_info
[
name
]
t
=
self
.
tensor_info
[
name
]
shape
=
t
[
"shape"
]
shape
=
t
[
"shape"
]
ggml_type
=
t
[
"ggml_type"
]
ggml_type
=
t
[
"ggml_type"
]
...
@@ -282,10 +283,12 @@ class GGUFLoader:
...
@@ -282,10 +283,12 @@ class GGUFLoader:
ggml_name
=
GGML_NAMES
[
ggml_type
]
ggml_name
=
GGML_NAMES
[
ggml_type
]
data
=
self
.
get_mmap_tensor
(
name
)
data
=
self
.
get_mmap_tensor
(
name
)
if
"cuda"
in
device
.
lower
():
if
"cuda"
in
device
.
lower
():
values
=
GGML_DEQUANTIZE_GPU
[
ggml_name
](
data
,
device
)
values
=
GGML_DEQUANTIZE_GPU
[
ggml_name
](
data
,
device
)
#values = GGML_DEQUANTIZE[ggml_name](data)
#print("load_gguf_tensor")
#values = torch.from_numpy(values).to(device = device)
else
:
else
:
values
=
GGML_DEQUANTIZE
[
ggml_name
](
data
)
values
=
GGML_DEQUANTIZE
[
ggml_name
](
data
)
values
=
torch
.
from_numpy
(
values
)
values
=
torch
.
from_numpy
(
values
)
...
@@ -375,7 +378,7 @@ def dequantize_q2_k(data):
...
@@ -375,7 +378,7 @@ def dequantize_q2_k(data):
return
d
*
(
scales
&
15
)
*
(
tmp
&
3
)
-
dmin
*
(
scales
>>
4
)
return
d
*
(
scales
&
15
)
*
(
tmp
&
3
)
-
dmin
*
(
scales
>>
4
)
def
dequantize_q2_k_gpu
(
data
):
def
dequantize_q2_k_gpu
(
data
):
pass
raise
NotImplementedError
()
def
dequantize_q3_k
(
data
):
def
dequantize_q3_k
(
data
):
# C implementation
# C implementation
...
@@ -420,7 +423,7 @@ def dequantize_q3_k(data):
...
@@ -420,7 +423,7 @@ def dequantize_q3_k(data):
],
axis
=
1
)
],
axis
=
1
)
def
dequantize_q3_k_gpu
(
data
):
def
dequantize_q3_k_gpu
(
data
):
pass
raise
NotImplementedError
()
def
dequantize_q4_k
(
data
):
def
dequantize_q4_k
(
data
):
# C implementation
# C implementation
...
@@ -429,20 +432,16 @@ def dequantize_q4_k(data):
...
@@ -429,20 +432,16 @@ def dequantize_q4_k(data):
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116
block_size
=
GGML_BLOCK_SIZES
[
"Q4_K"
]
block_size
=
GGML_BLOCK_SIZES
[
"Q4_K"
]
num_blocks
=
len
(
data
)
//
block_size
num_blocks
=
len
(
data
)
//
block_size
data_f16
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
).
reshape
(
num_blocks
,
block_size
//
2
)
data_f16
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
).
reshape
(
num_blocks
,
block_size
//
2
)
data_u8
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
block_size
)
data_u8
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
block_size
)
# Casting to float32 because float16 is very slow on CPU
# Casting to float32 because float16 is very slow on CPU
scale_factors
=
data_f16
[:,
0
].
reshape
(
num_blocks
,
1
,
1
).
astype
(
np
.
float32
)
scale_factors
=
data_f16
[:,
0
].
reshape
(
num_blocks
,
1
,
1
).
astype
(
np
.
float32
)
scale_offsets
=
data_f16
[:,
1
].
reshape
(
num_blocks
,
1
,
1
).
astype
(
np
.
float32
)
scale_offsets
=
data_f16
[:,
1
].
reshape
(
num_blocks
,
1
,
1
).
astype
(
np
.
float32
)
qs1
=
data_u8
[:,
4
:
16
].
reshape
(
num_blocks
,
12
,
1
)
qs1
=
data_u8
[:,
4
:
16
].
reshape
(
num_blocks
,
12
,
1
)
qs2
=
data_u8
[:,
16
:].
reshape
(
num_blocks
,
4
,
32
)
qs2
=
data_u8
[:,
16
:].
reshape
(
num_blocks
,
4
,
32
)
# Dequantize scales and offsets (6 bits and 4 + 2 bits)
# Dequantize scales and offsets (6 bits and 4 + 2 bits)
factors
=
scale_factors
*
np
.
concatenate
([
qs1
[:,
0
:
4
]
&
0b111111
,
(
qs1
[:,
8
:]
&
15
)
|
((
qs1
[:,
0
:
4
]
>>
6
)
<<
4
)],
axis
=
1
)
factors
=
scale_factors
*
np
.
concatenate
([
qs1
[:,
0
:
4
]
&
0b111111
,
(
qs1
[:,
8
:]
&
15
)
|
((
qs1
[:,
0
:
4
]
>>
6
)
<<
4
)],
axis
=
1
)
offsets
=
scale_offsets
*
np
.
concatenate
([
qs1
[:,
4
:
8
]
&
0b111111
,
(
qs1
[:,
8
:]
>>
4
)
|
((
qs1
[:,
4
:
8
]
>>
6
)
<<
4
)],
axis
=
1
)
offsets
=
scale_offsets
*
np
.
concatenate
([
qs1
[:,
4
:
8
]
&
0b111111
,
(
qs1
[:,
8
:]
>>
4
)
|
((
qs1
[:,
4
:
8
]
>>
6
)
<<
4
)],
axis
=
1
)
# Interleave low and high quantized bits
# Interleave low and high quantized bits
qs2
=
np
.
stack
([
qs2
&
0xf
,
qs2
>>
4
],
axis
=
2
).
reshape
(
num_blocks
,
8
,
32
)
qs2
=
np
.
stack
([
qs2
&
0xf
,
qs2
>>
4
],
axis
=
2
).
reshape
(
num_blocks
,
8
,
32
)
# Dequantize final weights using scales and offsets
# Dequantize final weights using scales and offsets
...
@@ -513,7 +512,7 @@ def dequantize_q5_k(data):
...
@@ -513,7 +512,7 @@ def dequantize_q5_k(data):
],
axis
=
1
)
],
axis
=
1
)
def
dequantize_q5_k_gpu
(
data
):
def
dequantize_q5_k_gpu
(
data
):
pass
raise
NotImplementedError
()
def
dequantize_q6_k
(
data
):
def
dequantize_q6_k
(
data
):
...
@@ -573,6 +572,48 @@ def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"):
...
@@ -573,6 +572,48 @@ def dequantize_q6_k_gpu(data: np.ndarray, device:str = "cuda"):
data
=
torch
.
from_numpy
(
data
)
data
=
torch
.
from_numpy
(
data
)
return
KTransformersOps
.
dequantize_q6_k
(
data
,
210
,
device
)
return
KTransformersOps
.
dequantize_q6_k
(
data
,
210
,
device
)
def
dequantize_q4_0
(
data
):
# C implementation
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1515
# C struct definition
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L141
num_blocks
=
len
(
data
)
//
GGML_BLOCK_SIZES
[
"Q4_0"
]
scales
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
).
reshape
(
num_blocks
,
1
+
8
)[:,
:
1
].
astype
(
np
.
float32
)
qs
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
2
+
16
)[:,
2
:]
return
np
.
concatenate
([
scales
*
((
qs
&
0xf
).
astype
(
np
.
int8
)
-
8
),
scales
*
((
qs
>>
4
).
astype
(
np
.
int8
)
-
8
),
],
axis
=
1
)
def
dequantize_q4_0_gpu
(
data
):
raise
NotImplementedError
()
def
dequantize_q5_0
(
data
):
# C implementation
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-quants.c#L1556
# C struct definition
# https://github.com/ggerganov/ggml/blob/a3c0188a4b5d3dec052ff87c9f773baa53631d70/src/ggml-common.h#L161
num_blocks
=
len
(
data
)
//
GGML_BLOCK_SIZES
[
"Q5_0"
]
scales
=
np
.
frombuffer
(
data
,
dtype
=
np
.
float16
).
reshape
(
num_blocks
,
1
+
2
+
8
)[:,
:
1
].
astype
(
np
.
float32
)
qh
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
2
+
4
+
16
)[:,
2
:
2
+
4
]
qs
=
np
.
frombuffer
(
data
,
dtype
=
np
.
uint8
).
reshape
(
num_blocks
,
2
+
4
+
16
)[:,
2
+
4
:]
bits
=
np
.
unpackbits
(
qh
,
axis
=-
1
,
bitorder
=
"little"
)
x0
=
((
qs
&
0xf
).
astype
(
np
.
int8
)
|
(
bits
[:,
:
16
]
<<
4
))
-
16
x1
=
((
qs
>>
4
).
astype
(
np
.
int8
)
|
(
bits
[:,
16
:]
<<
4
))
-
16
return
np
.
concatenate
([
scales
*
x0
,
scales
*
x1
,
],
axis
=
1
)
def
dequantize_q5_0_gpu
(
data
):
raise
NotImplementedError
()
def
dequantize_q8_0
(
data
):
def
dequantize_q8_0
(
data
):
# C struct definition
# C struct definition
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
...
@@ -615,6 +656,8 @@ def dequantize_f16_gpu(data, device):
...
@@ -615,6 +656,8 @@ def dequantize_f16_gpu(data, device):
GGML_DEQUANTIZE
=
{
GGML_DEQUANTIZE
=
{
"F32"
:
dequantize_f32
,
"F32"
:
dequantize_f32
,
"F16"
:
dequantize_f16
,
"F16"
:
dequantize_f16
,
"Q4_0"
:
dequantize_q4_0
,
"Q5_0"
:
dequantize_q5_0
,
"Q8_0"
:
dequantize_q8_0
,
"Q8_0"
:
dequantize_q8_0
,
"Q2_K"
:
dequantize_q2_k
,
"Q2_K"
:
dequantize_q2_k
,
"Q3_K"
:
dequantize_q3_k
,
"Q3_K"
:
dequantize_q3_k
,
...
@@ -626,6 +669,8 @@ GGML_DEQUANTIZE = {
...
@@ -626,6 +669,8 @@ GGML_DEQUANTIZE = {
GGML_DEQUANTIZE_GPU
=
{
GGML_DEQUANTIZE_GPU
=
{
"F32"
:
dequantize_f32_gpu
,
"F32"
:
dequantize_f32_gpu
,
"F16"
:
dequantize_f16_gpu
,
"F16"
:
dequantize_f16_gpu
,
"Q4_0"
:
dequantize_q4_0_gpu
,
"Q5_0"
:
dequantize_q5_0_gpu
,
"Q8_0"
:
dequantize_q8_0_gpu
,
"Q8_0"
:
dequantize_q8_0_gpu
,
"Q2_K"
:
dequantize_q2_k_gpu
,
"Q2_K"
:
dequantize_q2_k_gpu
,
"Q3_K"
:
dequantize_q3_k_gpu
,
"Q3_K"
:
dequantize_q3_k_gpu
,
...
...
ktransformers/util/utils.py
View file @
11544ef2
...
@@ -79,13 +79,15 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
...
@@ -79,13 +79,15 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
logits
=
cuda_graph_runner
(
cur_token
,
position_ids
,
cache_position
)
logits
=
cuda_graph_runner
(
cur_token
,
position_ids
,
cache_position
)
past_key_values
.
change_seq_length
(
1
)
past_key_values
.
change_seq_length
(
1
)
"""
"""
inputs_embeds = model.model.embed_tokens(cur_token.to("cpu")).to("cuda")
custom_stream = torch.cuda.Stream()
with torch.cuda.stream(custom_stream):
with torch.cuda.stream(custom_stream):
logits=model(
cur_token
,
logits=model(
inputs_embeds = inputs_embeds
,
position_ids
=
position_ids,
position_ids
=
position_ids,
cache_position
=
cache_position,
cache_position
=
cache_position,
past_key_values
=
past_key_values,
past_key_values
=
past_key_values,
return_dict
=
False, use_cache
=
True)[0]
return_dict
=
False, use_cache
=
True)
[0]
#
"""
"""
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
synchronize
()
#print(logits)
#print(logits)
next_token_scores
=
logits_warper
(
inputs
,
logits
[:,
-
1
,
:])
next_token_scores
=
logits_warper
(
inputs
,
logits
[:,
-
1
,
:])
...
@@ -108,7 +110,6 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
...
@@ -108,7 +110,6 @@ def prefill_and_generate(model, tokenizer, inputs, max_new_tokens=10000):
generated_ids
[:,
cache_position
]
=
inputs
.
to
(
torch_device
).
to
(
torch
.
int
)
generated_ids
[:,
cache_position
]
=
inputs
.
to
(
torch_device
).
to
(
torch
.
int
)
past_key_values
.
cur_idx
=
cache_position
past_key_values
.
cur_idx
=
cache_position
start_time
=
time
.
time
()
start_time
=
time
.
time
()
#custom_stream = torch.cuda.Stream()
inputs_embeds
=
model
.
model
.
embed_tokens
(
inputs
.
to
(
"cpu"
)).
to
(
"cuda"
)
inputs_embeds
=
model
.
model
.
embed_tokens
(
inputs
.
to
(
"cpu"
)).
to
(
"cuda"
)
logits
=
model
(
logits
=
model
(
...
...
pyproject.toml
View file @
11544ef2
...
@@ -3,7 +3,8 @@ requires = [
...
@@ -3,7 +3,8 @@ requires = [
"setuptools"
,
"setuptools"
,
"torch >= 2.3.0"
,
"torch >= 2.3.0"
,
"ninja"
,
"ninja"
,
"packaging"
"packaging"
,
"cpufeature"
]
]
build-backend
=
"setuptools.build_meta"
build-backend
=
"setuptools.build_meta"
...
...
setup.py
View file @
11544ef2
...
@@ -6,7 +6,7 @@ Author : chenxl
...
@@ -6,7 +6,7 @@ Author : chenxl
Date : 2024-07-27 16:15:27
Date : 2024-07-27 16:15:27
Version : 1.0.0
Version : 1.0.0
LastEditors : chenxl
LastEditors : chenxl
LastEditTime : 2024-0
7-31
0
9
:4
4:46
LastEditTime : 2024-0
8-08
0
2
:4
5:15
Adapted from:
Adapted from:
https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
Copyright (c) 2023, Tri Dao.
Copyright (c) 2023, Tri Dao.
...
@@ -19,6 +19,7 @@ import re
...
@@ -19,6 +19,7 @@ import re
import
ast
import
ast
import
subprocess
import
subprocess
import
platform
import
platform
import
shutil
import
http.client
import
http.client
import
urllib.request
import
urllib.request
import
urllib.error
import
urllib.error
...
@@ -27,6 +28,7 @@ from packaging.version import parse
...
@@ -27,6 +28,7 @@ from packaging.version import parse
import
torch.version
import
torch.version
from
wheel.bdist_wheel
import
bdist_wheel
as
_bdist_wheel
from
wheel.bdist_wheel
import
bdist_wheel
as
_bdist_wheel
from
setuptools
import
setup
,
Extension
from
setuptools
import
setup
,
Extension
from
cpufeature.extension
import
CPUFeature
from
torch.utils.cpp_extension
import
BuildExtension
,
CUDAExtension
,
CUDA_HOME
from
torch.utils.cpp_extension
import
BuildExtension
,
CUDAExtension
,
CUDA_HOME
class
CpuInstructInfo
:
class
CpuInstructInfo
:
...
@@ -67,6 +69,8 @@ class VersionInfo:
...
@@ -67,6 +69,8 @@ class VersionInfo:
"""
"""
if
sys
.
platform
.
startswith
(
"linux"
):
if
sys
.
platform
.
startswith
(
"linux"
):
return
f
'linux_
{
platform
.
uname
().
machine
}
'
return
f
'linux_
{
platform
.
uname
().
machine
}
'
elif
sys
.
platform
==
"win32"
:
return
"win_amd64"
else
:
else
:
raise
ValueError
(
"Unsupported platform: {}"
.
format
(
sys
.
platform
))
raise
ValueError
(
"Unsupported platform: {}"
.
format
(
sys
.
platform
))
...
@@ -97,6 +101,15 @@ class VersionInfo:
...
@@ -97,6 +101,15 @@ class VersionInfo:
return
'avx2'
return
'avx2'
raise
ValueError
(
raise
ValueError
(
"Unsupported cpu Instructions: {}"
.
format
(
flags_line
))
"Unsupported cpu Instructions: {}"
.
format
(
flags_line
))
elif
sys
.
platform
==
"win32"
:
if
CPUFeature
.
get
(
"AVX512bw"
,
False
):
return
'fancy'
if
CPUFeature
.
get
(
"AVX512f"
,
False
):
return
'avx512'
if
CPUFeature
.
get
(
"AVX2"
,
False
):
return
'avx2'
raise
ValueError
(
"Unsupported cpu Instructions: {}"
.
format
(
str
(
CPUFeature
)))
else
:
else
:
raise
ValueError
(
"Unsupported platform: {}"
.
format
(
sys
.
platform
))
raise
ValueError
(
"Unsupported platform: {}"
.
format
(
sys
.
platform
))
...
@@ -154,7 +167,7 @@ class BuildWheelsCommand(_bdist_wheel):
...
@@ -154,7 +167,7 @@ class BuildWheelsCommand(_bdist_wheel):
wheel_path
=
os
.
path
.
join
(
self
.
dist_dir
,
archive_basename
+
".whl"
)
wheel_path
=
os
.
path
.
join
(
self
.
dist_dir
,
archive_basename
+
".whl"
)
print
(
"Raw wheel path"
,
wheel_path
)
print
(
"Raw wheel path"
,
wheel_path
)
os
.
renam
e
(
wheel_filename
,
wheel_path
)
shutil
.
mov
e
(
wheel_filename
,
wheel_path
)
except
(
urllib
.
error
.
HTTPError
,
urllib
.
error
.
URLError
,
http
.
client
.
RemoteDisconnected
):
except
(
urllib
.
error
.
HTTPError
,
urllib
.
error
.
URLError
,
http
.
client
.
RemoteDisconnected
):
print
(
"Precompiled wheel not found. Building from source..."
)
print
(
"Precompiled wheel not found. Building from source..."
)
# If the wheel could not be downloaded, build from source
# If the wheel could not be downloaded, build from source
...
...
third_party/llamafile/iqk_mul_mat.inc
View file @
11544ef2
...
@@ -22,7 +22,7 @@
...
@@ -22,7 +22,7 @@
#include <cstring>
#include <cstring>
#include <type_traits>
#include <type_traits>
#if defined __x86_64__ || defined __aarch64__
#if defined __x86_64__ || defined __aarch64__
|| defined(_M_X64)
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml-quants.h"
...
@@ -225,7 +225,7 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi
...
@@ -225,7 +225,7 @@ bool iqk_mul_mat_moe(long Nx, long Ny, long ne00, int ne11, int typeA, const voi
return
true
;
return
true
;
}
}
#if defined __x86_64__
#if defined __x86_64__
|| defined(_M_X64)
#if defined HAVE_FANCY_SIMD
#if defined HAVE_FANCY_SIMD
#undef HAVE_FANCY_SIMD
#undef HAVE_FANCY_SIMD
...
@@ -1412,7 +1412,8 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
...
@@ -1412,7 +1412,8 @@ template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
bool
MulMat
::
set_mul_mat
(
int
typeA
,
int
ne00
,
MulMat
&
mm
,
int
&
row_size_q8
,
int
)
{
bool
MulMat
::
set_mul_mat
(
int
typeA
,
int
ne00
,
MulMat
&
mm
,
int
&
row_size_q8
,
int
)
{
row_size_q8
=
ggml_row_size
(
GGML_TYPE_Q8_K
,
ne00
);
if
(
ne00
%
ggml_blck_size
(
GGML_TYPE_Q8_K
)
==
0
)
row_size_q8
=
ggml_row_size
(
GGML_TYPE_Q8_K
,
ne00
);
switch
(
typeA
)
{
switch
(
typeA
)
{
case
GGML_TYPE_Q2_K
:
case
GGML_TYPE_Q2_K
:
...
...
third_party/llamafile/iqk_mul_mat_amd_avx2.cpp
View file @
11544ef2
...
@@ -3,6 +3,6 @@
...
@@ -3,6 +3,6 @@
// Copyrigth 2024 Iwan Kawrakow.
// Copyrigth 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef
__x86_64__
#if
def
ined(
__x86_64__
) || defined(_M_X64)
#include "iqk_mul_mat.inc"
#include "iqk_mul_mat.inc"
#endif // __x86_64__
#endif // __x86_64__
third_party/llamafile/iqk_mul_mat_amd_zen4.cpp
View file @
11544ef2
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
// Copyrigth 2024 Iwan Kawrakow.
// Copyrigth 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef
__x86_64__
#if
def
ined(
__x86_64__
) || defined(_M_X64)
#define iqk_mul_mat iqk_mul_mat_zen4
#define iqk_mul_mat iqk_mul_mat_zen4
#define iqk_mul_mat_moe iqk_mul_mat_moe_zen4
#define iqk_mul_mat_moe iqk_mul_mat_moe_zen4
#include "iqk_mul_mat.inc"
#include "iqk_mul_mat.inc"
...
...
third_party/llamafile/sgemm.cpp
View file @
11544ef2
...
@@ -22,19 +22,22 @@
...
@@ -22,19 +22,22 @@
#include "sgemm.h"
#include "sgemm.h"
// #include <cosmo.h>
// #include <cosmo.h>
#include <cpuid.h>
//
#include <cpuid.h>
// #include <libc/sysv/consts/hwcap.h>
// #include <libc/sysv/consts/hwcap.h>
#include <stdio.h>
#include <stdio.h>
#include <sys/auxv.h>
//
#include <sys/auxv.h>
#include <cassert>
#include <cassert>
// #include "llamafile.h"
// #include "llamafile.h"
static
const
struct
GemmFuncs
{
static
const
struct
GemmFuncs
{
typeof
(
llamafile_sgemm
)
*
sgemm
;
bool
(
*
sgemm
)(
long
,
long
,
long
,
const
void
*
,
long
,
const
void
*
,
long
,
void
*
,
long
,
int
,
int
,
int
,
int
,
int
,
int
,
int
);
typeof
(
llamafile_mixmul
)
*
mixmul
;
bool
(
*
mixmul
)(
const
struct
ggml_compute_params
*
,
const
struct
ggml_tensor
*
,
const
struct
ggml_tensor
*
,
const
struct
ggml_tensor
*
,
struct
ggml_tensor
*
);
typeof
(
llamafile_mixmul_iqk
)
*
iqk_mixmul
=
iqk_mul_mat_moe_unsupported
;
bool
(
*
iqk_mixmul
)(
long
,
long
,
long
,
int
,
int
,
const
void
*
,
const
void
*
,
float
*
,
long
,
long
,
const
void
*
,
int
,
int
);
// typeof(llamafile_sgemm)* sgemm;
// typeof(llamafile_mixmul)* mixmul;
// typeof(llamafile_mixmul_iqk)* iqk_mixmul = iqk_mul_mat_moe_unsupported;
GemmFuncs
()
{
GemmFuncs
()
{
#ifdef
__x86_64__
#if
def
ined(
__x86_64__
) || defined(_M_X64)
// if (X86_HAVE(AVX)) {
// if (X86_HAVE(AVX)) {
// if (X86_HAVE(FMA)) {
// if (X86_HAVE(FMA)) {
// if (X86_HAVE(AVX2)) {
// if (X86_HAVE(AVX2)) {
...
@@ -86,10 +89,12 @@ static const struct GemmFuncs {
...
@@ -86,10 +89,12 @@ static const struct GemmFuncs {
// sgemm = llamafile_sgemm_unsupported;
// sgemm = llamafile_sgemm_unsupported;
// mixmul = llamafile_mixmul_unsupported;
// mixmul = llamafile_mixmul_unsupported;
// }
// }
#if defined(__AVX__)
#if defined(__AVX__)
#if defined(__FMA__)
#if defined(__FMA__)
|| (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX2__)
#if defined(__AVX2__)
#if defined(__AVX512F__)
#if defined(__AVX512F__)
printf
(
"__AVX512F__
\n
"
);
#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
#if defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__) && defined(__AVX512VNNI__) && defined(__AVX512BF16__)
// AMD Zen4+ (2023-)
// AMD Zen4+ (2023-)
sgemm
=
llamafile_sgemm_amd_zen4
;
sgemm
=
llamafile_sgemm_amd_zen4
;
...
...
third_party/llamafile/tinyblas_cpu.h
View file @
11544ef2
...
@@ -223,7 +223,7 @@ inline float32x4_t badder(float32x4_t a, float b, float32x4_t c, float32x4_t* e)
...
@@ -223,7 +223,7 @@ inline float32x4_t badder(float32x4_t a, float b, float32x4_t c, float32x4_t* e)
}
}
#endif
#endif
#if defined(__FMA__)
#if defined(__FMA__)
|| (defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)))
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
template
<
>
template
<
>
inline
__m256
madd
(
__m256
a
,
__m256
b
,
__m256
c
)
{
inline
__m256
madd
(
__m256
a
,
__m256
b
,
__m256
c
)
{
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment