ModelZoo / Qwen_lmdeploy · Commits

Commit e38ee081, authored Nov 14, 2023 by xiabo

Adapt to rocm

parent 56942c43
Showing 20 of 41 changed files, with 301 additions and 144 deletions (+301 -144).
CMakeLists.txt  +21 -18
generate.sh  +4 -1
src/turbomind/kernels/CMakeLists.txt  +29 -27
src/turbomind/kernels/activation_kernels.cu  +8 -4
src/turbomind/kernels/custom_ar_kernels.cu  +20 -6
src/turbomind/kernels/custom_ar_kernels.h  +2 -1
src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh  +76 -16
src/turbomind/kernels/decoder_masked_multihead_attention_utils.h  +40 -13
src/turbomind/kernels/gpt_kernels.cu  +2 -1
src/turbomind/kernels/logprob_kernels.cu  +2 -1
src/turbomind/kernels/sampling_topk_kernels.cu  +2 -1
src/turbomind/kernels/sampling_topp_kernels.cu  +2 -1
src/turbomind/kernels/stop_criteria_kernels.cu  +2 -1
src/turbomind/kernels/unfused_attention_kernels.cu  +21 -4
src/turbomind/layers/CMakeLists.txt  +7 -5
src/turbomind/layers/sampling_layers/CMakeLists.txt  +14 -10
src/turbomind/models/llama/CMakeLists.txt  +12 -9
src/turbomind/models/llama/LlamaContextAttentionLayer.cc  +8 -5
src/turbomind/models/llama/LlamaDecoderLayerWeight.cc  +11 -2
src/turbomind/models/llama/LlamaLinear.h  +18 -18
CMakeLists.txt

@@ -44,18 +44,18 @@ option(BUILD_TEST "Build tests" OFF)
 include(FetchContent)
 
-FetchContent_Declare(
-        repo-cutlass
-        GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
-        GIT_TAG        6f47420213f757831fae65c686aa471749fa8d60
-)
-set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
-FetchContent_MakeAvailable(repo-cutlass)
-
-set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
-set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)
+# FetchContent_Declare(
+#         repo-cutlass
+#         GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
+#         GIT_TAG        cc85b64cf676c45f98a17e3a47c0aafcf817f088
+# )
+# set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
+# FetchContent_MakeAvailable(repo-cutlass)
+
+# set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
+# set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)
 
 option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF)

@@ -110,6 +110,7 @@ endif()
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -ldl")  # -Xptxas -v
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --gpu-max-threads-per-block=1024")
 
 set(SM_SETS 52 60 61 70 75 80 86 89 90)
 set(USING_WMMA False)

@@ -266,7 +267,8 @@ endif()
 if(BUILD_MULTI_GPU)
     list(APPEND COMMON_HEADER_DIRS ${MPI_INCLUDE_PATH})
-    list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib)
+    #list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib)
+    list(APPEND COMMON_LIB_DIRS /opt/mpi/lib)
 endif()
 
 if(USE_TRITONSERVER_DATATYPE)

@@ -311,11 +313,11 @@
 add_library(transformer-shared SHARED
   $<TARGET_OBJECTS:BaseSamplingLayer>
   $<TARGET_OBJECTS:DynamicDecodeLayer>
-  $<TARGET_OBJECTS:llama_fmha>
+  # $<TARGET_OBJECTS:llama_fmha>
   $<TARGET_OBJECTS:flash_attention2>
   $<TARGET_OBJECTS:Llama>
   $<TARGET_OBJECTS:LlamaTritonBackend>
-  $<TARGET_OBJECTS:gemm_s4_f16>
+  # $<TARGET_OBJECTS:gemm_s4_f16>
   $<TARGET_OBJECTS:TopKSamplingLayer>
   $<TARGET_OBJECTS:TopPSamplingLayer>
   $<TARGET_OBJECTS:TransformerTritonBackend>

@@ -353,15 +355,16 @@ target_link_libraries(transformer-shared PUBLIC
 endif()
 
 if (USE_NVTX)
-  target_link_libraries(transformer-shared PUBLIC
-    -lnvToolsExt
-  )
+  # target_link_libraries(transformer-shared PUBLIC
+  #   -lnvToolsExt
+  # )
 endif()
 
-set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON)
+# set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 set_target_properties(transformer-shared PROPERTIES LINKER_LANGUAGE CXX)
-target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcublasLt -lcurand)
+#target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcublasLt -lcurand)
+target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcurand)
 
 include(GNUInstallDirs)
 set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TurboMind)

generate.sh

 #!/bin/sh
 
 cmake .. \
+    -DCMAKE_CXX_COMPILER=nvcc \
+    -DCMAKE_C_COMPILER=nvcc \
     -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
     -DCMAKE_INSTALL_PREFIX=./install \
     -DBUILD_PY_FFI=ON \
     -DBUILD_MULTI_GPU=ON \
     -DCMAKE_CUDA_FLAGS="-lineinfo" \
-    -DUSE_NVTX=ON
+    -DUSE_NVTX=OFF \
+    # -DBUILD_TEST=ON

src/turbomind/kernels/CMakeLists.txt

@@ -13,61 +13,63 @@
 # limitations under the License.
 
 cmake_minimum_required(VERSION 3.8)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
 
 add_library(ban_bad_words STATIC ban_bad_words.cu)
-set_property(TARGET ban_bad_words PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET ban_bad_words PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_property(TARGET ban_bad_words PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET ban_bad_words PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 add_library(stop_criteria STATIC stop_criteria_kernels.cu)
-set_property(TARGET stop_criteria PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET stop_criteria PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_property(TARGET stop_criteria PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET stop_criteria PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 add_library(activation_kernels STATIC activation_kernels.cu)
-set_property(TARGET activation_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET activation_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_property(TARGET activation_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET activation_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 add_library(logprob_kernels STATIC logprob_kernels.cu)
-set_property(TARGET logprob_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET logprob_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_property(TARGET logprob_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET logprob_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 add_library(unfused_attention_kernels STATIC unfused_attention_kernels.cu)
-set_property(TARGET unfused_attention_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET unfused_attention_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_property(TARGET unfused_attention_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET unfused_attention_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 add_library(bert_preprocess_kernels STATIC bert_preprocess_kernels.cu)
-set_property(TARGET bert_preprocess_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET bert_preprocess_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_property(TARGET bert_preprocess_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET bert_preprocess_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 set(decoder_masked_multihead_attention_files
     decoder_masked_multihead_attention.cu
 )
 file(GLOB decoder_masked_multihead_attention_files ${decoder_masked_multihead_attention_files} ./decoder_masked_multihead_attention/*.cu)
 add_library(decoder_masked_multihead_attention STATIC ${decoder_masked_multihead_attention_files})
-set_property(TARGET decoder_masked_multihead_attention PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET decoder_masked_multihead_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_property(TARGET decoder_masked_multihead_attention PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET decoder_masked_multihead_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 add_library(decoding_kernels STATIC decoding_kernels.cu)
-set_property(TARGET decoding_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET decoding_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_property(TARGET decoding_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET decoding_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 add_library(gpt_kernels STATIC gpt_kernels.cu)
-set_property(TARGET gpt_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET gpt_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_property(TARGET gpt_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET gpt_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 add_library(sampling_topk_kernels STATIC sampling_topk_kernels.cu)
-set_property(TARGET sampling_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET sampling_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_property(TARGET sampling_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET sampling_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 add_library(sampling_topp_kernels STATIC sampling_topp_kernels.cu)
-set_property(TARGET sampling_topp_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET sampling_topp_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_property(TARGET sampling_topp_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET sampling_topp_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 add_library(sampling_penalty_kernels STATIC sampling_penalty_kernels.cu)
-set_property(TARGET sampling_penalty_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET sampling_penalty_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_property(TARGET sampling_penalty_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET sampling_penalty_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 add_library(custom_ar_kernels STATIC custom_ar_kernels.cu)
-set_property(TARGET custom_ar_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET custom_ar_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_property(TARGET custom_ar_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET custom_ar_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
-add_subdirectory(gemm_s_f16)
+# add_subdirectory(gemm_s_f16)

src/turbomind/kernels/activation_kernels.cu

@@ -106,7 +106,8 @@ struct ReluActivation<half2> {
     static __device__ __forceinline__ half2 apply(const half2& val)
     {
         const half zero_half = static_cast<half>(0.0f);
-        return make_half2(val.x > zero_half ? val.x : zero_half, val.y > zero_half ? val.y : zero_half);
+        // return make_half2(val.x > zero_half ? val.x : zero_half, val.y > zero_half ? val.y : zero_half);
+        return make_half2(static_cast<half>(val.data[0]) > zero_half ? static_cast<half>(val.data[0]) : zero_half, static_cast<half>(val.data[1]) > zero_half ? static_cast<half>(val.data[1]) : zero_half);
     }
 };

@@ -117,7 +118,8 @@ struct ReluActivation<__nv_bfloat162> {
     static __device__ __forceinline__ __nv_bfloat162 apply(const __nv_bfloat162& val)
     {
         const __nv_bfloat16 zero_bf16 = static_cast<__nv_bfloat16>(0.0f);
-        return make_bfloat162(val.x > zero_bf16 ? val.x : zero_bf16, val.y > zero_bf16 ? val.y : zero_bf16);
+        // return make_bfloat162(val.x > zero_bf16 ? val.x : zero_bf16, val.y > zero_bf16 ? val.y : zero_bf16);
+        return make_bfloat162(val.data[0] > zero_bf16 ? val.data[0] : zero_bf16, val.data[1] > zero_bf16 ? val.data[1] : zero_bf16);
     }
 };
 #endif

@@ -138,7 +140,8 @@ struct SiluActivation<half2> {
     using return_type = float2;
     static __device__ __forceinline__ float2 apply(const half2& val)
     {
-        return make_float2(SiluActivation<float>::apply(val.x), SiluActivation<float>::apply(val.y));
+        // return make_float2(SiluActivation<float>::apply(val.x), SiluActivation<float>::apply(val.y));
+        return make_float2(SiluActivation<float>::apply(val.data[0]), SiluActivation<float>::apply(val.data[1]));
     }
 };

@@ -148,7 +151,8 @@ struct SiluActivation<__nv_bfloat162> {
     using return_type = float2;
     static __device__ __forceinline__ float2 apply(const __nv_bfloat162& val)
     {
-        return make_float2(SiluActivation<float>::apply(val.x), SiluActivation<float>::apply(val.y));
+        // return make_float2(SiluActivation<float>::apply(val.x), SiluActivation<float>::apply(val.y));
+        return make_float2(SiluActivation<float>::apply(val.data[0]), SiluActivation<float>::apply(val.data[1]));
     }
 };
 #endif // ENABLE_BF16

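The pattern in this file, replacing `.x`/`.y` with `.data[0]`/`.data[1]`, reflects how this ROCm toolchain defines `__half2` and `__nv_bfloat162`: the two lanes are exposed as an array rather than named fields. A member-name-agnostic way to read the lanes is to copy the packed value out, which sidesteps the difference entirely (a minimal sketch; `lane_to_float` is an illustrative name, not part of the commit):

```cpp
#include <cuda_fp16.h>  // hipify typically maps this to hip/hip_fp16.h on ROCm
#include <cstring>

// Read lane i (0 or 1) of a packed half2 without naming its members.
// __builtin_memcpy is well-defined for this bit copy and compiles down
// to plain register moves in device code.
__device__ inline float lane_to_float(const __half2& v, int i)
{
    __half lanes[2];
    __builtin_memcpy(lanes, &v, sizeof(lanes));
    return __half2float(lanes[i]);
}
```
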
src/turbomind/kernels/custom_ar_kernels.cu

@@ -24,7 +24,12 @@ namespace turbomind {
 static inline __device__ uint32_t hadd2(const uint32_t& a, const uint32_t& b)
 {
     uint32_t c;
-    asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    // asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    const __half* ha = reinterpret_cast<const __half*>(&a);
+    const __half* hb = reinterpret_cast<const __half*>(&b);
+    __half2 h2c = make_half2(ha[0] + hb[0], ha[1] + hb[1]);
+    __builtin_memcpy(&c, &h2c, sizeof(h2c));
+    // asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
     return c;
 }

@@ -33,7 +38,12 @@ static inline __device__ uint32_t hadd2(const uint32_t& a, const uint32_t& b)
 static inline __device__ uint32_t fadd(const uint32_t& a, const uint32_t& b)
 {
     uint32_t c;
-    asm volatile("add.f32 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    // asm volatile("add.f32 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    union {
+        float*          f_p;
+        const uint32_t* u_p;
+    } x, y, z;
+    x.u_p = &a; y.u_p = &b; z.u_p = &c;
+    *z.f_p = *x.f_p + *y.f_p;
     return c;
 }

@@ -42,10 +52,12 @@ static inline __device__ uint32_t fadd(const uint32_t& a, const uint32_t& b)
 static inline __device__ void st_flag_release(uint32_t& flag, uint32_t* flag_addr)
 {
 #if __CUDA_ARCH__ >= 700
-    asm volatile("st.global.release.sys.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+    // asm volatile("st.global.release.sys.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+    *flag_addr = flag;
 #else
     __threadfence_system();
-    asm volatile("st.global.volatile.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+    // asm volatile("st.global.volatile.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+    *flag_addr = flag;
 #endif
 }

@@ -54,9 +66,11 @@ static inline __device__ void st_flag_release(uint32_t& flag, uint32_t* flag_addr
 static inline __device__ void ld_flag_acquire(uint32_t& flag, uint32_t* flag_addr)
 {
 #if __CUDA_ARCH__ >= 700
-    asm volatile("ld.global.acquire.sys.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
+    // asm volatile("ld.global.acquire.sys.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
+    flag = *flag_addr;
 #else
-    asm volatile("ld.global.volatile.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
+    // asm volatile("ld.global.volatile.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
+    flag = *flag_addr;
 #endif
 }

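The `hadd2` rewrite above is the core trick of this port: treat the 32-bit word as two `__half` lanes, do the math with fp16 operations, and `memcpy` the result back. A self-contained sketch of the same technique using the `__hadd` intrinsic (names here are illustrative, not from the commit):

```cpp
#include <cuda_fp16.h>
#include <cstdint>
#include <cstring>

// Portable stand-in for PTX "add.f16x2": per-lane fp16 add on a packed word.
// Writing the result through reinterpret_cast<uint32_t*>(&r) would break
// strict aliasing; memcpy is the well-defined spelling and typically
// compiles away to a register move.
static inline __device__ uint32_t f16x2_add(uint32_t a, uint32_t b)
{
    const __half* ha = reinterpret_cast<const __half*>(&a);
    const __half* hb = reinterpret_cast<const __half*>(&b);
    __half2 r = make_half2(__hadd(ha[0], hb[0]), __hadd(ha[1], hb[1]));
    uint32_t c;
    __builtin_memcpy(&c, &r, sizeof(c));
    return c;
}
```
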
src/turbomind/kernels/custom_ar_kernels.h

@@ -27,7 +27,8 @@
 #define MAX_ALL_REDUCE_BLOCKS 24
 #define FLAG(a) ((uint32_t)((a) % 0x146))
 #define RANKS_PER_NODE 8
-#define WARP_SIZE 32
+// #define WARP_SIZE 32
+#define WARP_SIZE 64
 #define DEFAULT_BLOCK_SIZE 1024
 #define DEFALUT_ALGO_AR_SIZE_THRESHOLD 393216

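AMD GCN/CDNA GPUs execute 64-wide wavefronts, hence the `WARP_SIZE` bump from 32 to 64; every `THREADS_PER_BLOCK / WARP_SIZE` computation downstream changes with it. A compile-time guard can keep both worlds in one header without editing call sites (a sketch; `kWarpSize` is an illustrative name, and `__HIP_PLATFORM_AMD__` assumes a current hipcc):

```cpp
// 64 on wave64 (GCN/CDNA) targets, 32 on NVIDIA. RDNA parts can run wave32,
// so treat this as a build-time assumption, not a universal constant.
#if defined(__HIP_PLATFORM_AMD__)
constexpr int kWarpSize = 64;
#else
constexpr int kWarpSize = 32;
#endif
static_assert(1024 % kWarpSize == 0, "block size must be a multiple of the wavefront");
```
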
src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh

@@ -628,19 +628,68 @@ struct Qk_dot {
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+__device__ inline void f16mulf16addf32(uint32_t& a, uint32_t& b, const float* c, float* d)
+{
+    // uint32_t res = 0;
+    // asm volatile("v_pk_fma_f16 %0, %1,%2,%3" : "=v"(res) : "v"(a), "v"(b), "v"(res));
+    // __half * h = reinterpret_cast<__half*>(&res);
+    __half* ha = reinterpret_cast<__half*>(&a);
+    __half* hb = reinterpret_cast<__half*>(&b);
+    *d = *c + __half2float(ha[0]) * __half2float(hb[0]) + __half2float(ha[1]) * __half2float(hb[1]);
+}
+
+// row 8 col 4
+__device__ inline void m16n8k8(const uint32_t* A, const uint32_t* B, /*const float * C,*/ float* D)
+{
+    int tid    = threadIdx.x;
+    int baseId = tid / 32 * 32;
+    __shared__ uint32_t smem[1024 * 3];
+    int base = tid * 3;
+    __builtin_memcpy(smem + base, A, sizeof(uint32_t));
+    __builtin_memcpy(smem + (base + 1), A + 1, sizeof(uint32_t));
+    __builtin_memcpy(smem + (base + 2), B, sizeof(uint32_t));
+    __syncthreads();
+    /* From D's point of view, each thread computes its own piece of D: looping from thread 0,
+       it fetches one row of A and two columns of B.
+       s is the thread index into matrix B;
+       baseA is the thread index into A;
+       baseB0 is the first B column this thread reads, baseB1 the second. */
+    int s = baseId + (tid % 4) * 8, e = s + 4;
+    for (int i = s; i < e; ++i) {
+        // A[0]->i  A[1]->i+1  B[0]->i+2
+        int baseA = (tid - tid % 4 + i - s) * 3;
+        // thread index of the first column in this tid's row, plus the stride, times 3
+        int baseB0 = i * 3, baseB1 = (i + 4) * 3;
+        f16mulf16addf32(smem[baseA], smem[baseB0 + 2], D, D);
+        f16mulf16addf32(smem[baseA], smem[baseB1 + 2], D + 1, D + 1);
+        f16mulf16addf32(smem[baseA + 1], smem[baseB0 + 2], D + 2, D + 2);
+        f16mulf16addf32(smem[baseA + 1], smem[baseB1 + 2], D + 3, D + 3);
+    }
+}
+
 inline __device__ float4 hmma_fp32(const uint2& a, uint32_t b)
 {
     float4 c;
     float zero = 0.f;
-    asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 \n"
-                 "    {%0, %1, %2, %3}, \n"
-                 "    {%4, %5}, \n"
-                 "    {%6}, \n"
-                 "    {%7, %7, %7, %7}; \n"
-                 : "=f"(c.x), "=f"(c.y), "=f"(c.z), "=f"(c.w)
-                 : "r"(a.x), "r"(a.y), "r"(b), "f"(zero));
+    const uint32_t* A = reinterpret_cast<const uint32_t*>(&a);
+    const uint32_t* B = reinterpret_cast<const uint32_t*>(b);
+    float*          C = reinterpret_cast<float*>(&c);
+    m16n8k8(A, B, C);
+    // asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 \n"
+    //              "    {%0, %1, %2, %3}, \n"
+    //              "    {%4, %5}, \n"
+    //              "    {%6}, \n"
+    //              "    {%7, %7, %7, %7}; \n"
+    //              : "=f"(c.x), "=f"(c.y), "=f"(c.z), "=f"(c.w)
+    //              : "r"(a.x), "r"(a.y), "r"(b), "f"(zero));
     return c;
 }

@@ -688,7 +737,8 @@ struct Qk_dot<uint16_t, 4> {
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
-template<int WARPS_PER_BLOCK, int WARP_SIZE = 32>
+// template<int WARPS_PER_BLOCK, int WARP_SIZE = 32>
+template<int WARPS_PER_BLOCK, int WARP_SIZE = 64>
 inline __device__ float block_sum(float* red_smem, float sum)
 {

@@ -1110,12 +1160,21 @@ inline __device__ Float8_ dequant(int64_t a, const float scale, const float zp)
 inline __device__ int8_t cast_to_int8(float val)
 {
-    union {
-        int8_t  int8[2];
-        int16_t int16;
-    };
-    asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val));
-    return int8[0];
+    // union {
+    //     int8_t  int8[2];
+    //     int16_t int16;
+    // };
+    // asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val));
+    // return int8[0];
+    int8_t dst;
+    if (val >= 128) {
+        dst = 127;
+    } else if (val < -128) {
+        dst = -128;
+    } else {
+        dst = static_cast<int8_t>(val);
+    }
+    return dst;
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -1239,7 +1298,8 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
     static_assert(Dh_MAX % THREADS_PER_VALUE == 0, "");
 
     // The size of a warp.
-    constexpr int WARP_SIZE = 32;
+    // constexpr int WARP_SIZE = 32;
+    constexpr int WARP_SIZE = 64;
 
     // The number of warps in a threadblock.
     constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE;

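One behavioral nuance in this file: the PTX `cvt.rni.sat.s8.f32` that `cast_to_int8` used to emit rounds to the nearest integer (ties to even) and saturates, while the replacement truncates toward zero via `static_cast`. If bit-exact parity with the CUDA path mattered, a closer portable equivalent might look like this sketch (not part of the commit):

```cpp
#include <cstdint>

// Round-to-nearest-even with saturation to [-128, 127], approximating the
// semantics of PTX cvt.rni.sat.s8.f32. rintf() rounds to nearest even under
// the default device rounding mode. (NaN handling still differs: the PTX
// instruction maps NaN to 0, this sketch maps it to -128.)
inline __device__ int8_t cast_to_int8_rni(float val)
{
    float r = rintf(val);
    r       = fminf(fmaxf(r, -128.0f), 127.0f);
    return static_cast<int8_t>(r);
}
```
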
src/turbomind/kernels/decoder_masked_multihead_attention_utils.h

@@ -147,7 +147,8 @@ inline __device__ float4 add(float4 a, float4 b)
 inline __device__ uint16_t add(uint16_t a, uint16_t b)
 {
     uint16_t c;
-    asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
+    // asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
+    asm volatile("v_add_f16 %0, %1, %2;" : "=v"(c) : "v"(a), "v"(b));
     return c;
 }

@@ -156,7 +157,11 @@ inline __device__ uint16_t add(uint16_t a, uint16_t b)
 inline __device__ uint32_t add(uint32_t a, uint32_t b)
 {
     uint32_t c;
-    asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    // asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    const __half* ha = reinterpret_cast<const __half*>(&a);
+    const __half* hb = reinterpret_cast<const __half*>(&b);
+    __half2 h2c = make_half2(ha[0] + hb[0], ha[1] + hb[1]);
+    __builtin_memcpy(&c, &h2c, sizeof(h2c));
     return c;
 }

@@ -192,9 +197,13 @@ inline __device__ uint16_t float_to_half(float f)
     } tmp;
 #if 0 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 // Is it better?
     float zero = 0.f;
-    asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(zero), "f"(f));
+    // asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(zero), "f"(f));
+    __half h = __float2half(f);
+    tmp.u16[0] = reinterpret_cast<const uint16_t&>(h);
 #else
-    asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f));
+    // asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f));
+    __half h = __float2half(f);
+    tmp.u16[0] = reinterpret_cast<const uint16_t&>(h);
 #endif
     return tmp.u16[0];
 }

@@ -208,10 +217,18 @@ inline __device__ uint32_t float2_to_half2(float2 f)
         uint16_t u16[2];
     } tmp;
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-    asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
+    // asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
+    __half h1 = __float2half(f.x);
+    __half h2 = __float2half(f.y);
+    tmp.u16[0] = reinterpret_cast<const uint16_t&>(h1);
+    tmp.u16[1] = reinterpret_cast<const uint16_t&>(h2);
 #else
-    asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
-    asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
+    // asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
+    // asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
+    __half h1 = __float2half(f.x);
+    __half h2 = __float2half(f.y);
+    tmp.u16[0] = reinterpret_cast<const uint16_t&>(h1);
+    tmp.u16[1] = reinterpret_cast<const uint16_t&>(h2);
 #endif
     return tmp.u32;
 }

@@ -221,7 +238,8 @@ inline __device__ uint32_t float2_to_half2(float2 f)
 inline __device__ float half_to_float(uint16_t h)
 {
     float f;
-    asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h));
+    // asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h));
+    f = __half2float(reinterpret_cast<const __half&>(h));
     return f;
 }

@@ -230,7 +248,9 @@ inline __device__ float half_to_float(uint16_t h)
 inline __device__ float2 half2_to_float2(uint32_t v)
 {
     uint16_t lo, hi;
-    asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v));
+    // asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v));
+    lo = v & 0xffff;
+    hi = (v >> 16) & 0xffff;
     return make_float2(half_to_float(lo), half_to_float(hi));
 }

@@ -276,7 +296,11 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb)
 inline __device__ uint32_t h0_h0(uint16_t a)
 {
     uint32_t b;
-    asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a));
+    // asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a));
+    uint16_t tmp[2];
+    tmp[0] = a;
+    tmp[1] = a;
+    __builtin_memcpy(&b, tmp, sizeof(uint16_t) * 2);
     return b;
 }

@@ -370,7 +394,8 @@ inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c)
 inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c)
 {
     uint32_t d;
-    asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
+    // asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
+    asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c));
     return d;
 }

@@ -581,7 +606,8 @@ template<>
 inline __device__ uint16_t mul(uint16_t a, uint16_t b)
 {
     uint16_t c;
-    asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
+    // asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
+    asm volatile("v_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
     return c;
 }

@@ -591,7 +617,8 @@ template<>
 inline __device__ uint32_t mul(uint32_t a, uint32_t b)
 {
     uint32_t c;
-    asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    // asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    asm volatile("v_pk_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
     return c;
 }

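This header takes the other route available on ROCm: instead of emulating with scalar fp16 math, it swaps the PTX for GCN VALU instructions (`v_add_f16`, `v_pk_fma_f16`, `v_pk_mul_f16`) via inline asm with `v` register constraints. Where raw GCN assembly is undesirable, the same packed fma can be written with the `__hfma2` intrinsic, which both nvcc and hipcc provide (a sketch; `f16x2_fma` is an illustrative name):

```cpp
#include <cuda_fp16.h>
#include <cstdint>
#include <cstring>

// Same contract as the fma(uint32_t, uint32_t, uint32_t) above:
// two fp16 lanes packed per 32-bit word, d = a * b + c per lane.
static inline __device__ uint32_t f16x2_fma(uint32_t a, uint32_t b, uint32_t c)
{
    __half2 ha, hb, hc;
    __builtin_memcpy(&ha, &a, sizeof(ha));
    __builtin_memcpy(&hb, &b, sizeof(hb));
    __builtin_memcpy(&hc, &c, sizeof(hc));
    const __half2 hd = __hfma2(ha, hb, hc);  // per-lane fused multiply-add
    uint32_t d;
    __builtin_memcpy(&d, &hd, sizeof(d));
    return d;
}
```
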
src/turbomind/kernels/gpt_kernels.cu

@@ -20,7 +20,8 @@
 #elif (CUDART_VERSION >= 11000)
 #include <cub/cub.cuh>
 #else
-#include "3rdparty/cub/cub.cuh"
+// #include "3rdparty/cub/cub.cuh"
+#include <cub/cub.cuh>
 #endif
 #include "src/turbomind/kernels/gpt_kernels.h"
 #include "src/turbomind/utils/memory_utils.h"

src/turbomind/kernels/logprob_kernels.cu

@@ -23,7 +23,8 @@
 #elif (CUDART_VERSION >= 11000)
 #include <cub/cub.cuh>
 #else
-#include "3rdparty/cub/cub.cuh"
+// #include "3rdparty/cub/cub.cuh"
+#include <cub/cub.cuh>
 #endif
 #include "src/turbomind/kernels/logprob_kernels.h"

src/turbomind/kernels/sampling_topk_kernels.cu

@@ -21,7 +21,8 @@
 #elif (CUDART_VERSION >= 11000)
 #include <cub/cub.cuh>
 #else
-#include "3rdparty/cub/cub.cuh"
+// #include "3rdparty/cub/cub.cuh"
+#include <cub/cub.cuh>
 #endif
 #include "src/turbomind/kernels/reduce_kernel_utils.cuh"

src/turbomind/kernels/sampling_topp_kernels.cu

@@ -19,7 +19,8 @@
 #elif (CUDART_VERSION >= 11000)
 #include <cub/cub.cuh>
 #else
-#include "3rdparty/cub/cub.cuh"
+// #include "3rdparty/cub/cub.cuh"
+#include <cub/cub.cuh>
 #endif
 #include "src/turbomind/kernels/reduce_kernel_utils.cuh"

src/turbomind/kernels/stop_criteria_kernels.cu

@@ -145,7 +145,8 @@ void invokeLengthCriterion(bool* finished,
     // Check if we have attained the sequence length limit. If so, stop the sequence.
     // In addition, check if all sequences are stopped and return the result in should_stop
     TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
-    dim3 block{min(512, uint32_t(batch_size * beam_width))};
+    // dim3 block{min(512, uint32_t(batch_size * beam_width))};
+    dim3 block{static_cast<unsigned int>(min(512, uint32_t(batch_size * beam_width)))};
     dim3 grid{1};
     h_pinned_finished_sum_[0] = -1;

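The cast here is about list-initialization: `dim3 block{...}` uses braces, and braced init rejects implicit narrowing conversions, so when `min(512, uint32_t(...))` does not resolve to exactly `unsigned int` on the host compiler, the original line fails to compile. The explicit `static_cast<unsigned int>` makes the conversion well-formed. A host-side analogue of the failure mode (illustrative only):

```cpp
#include <algorithm>
#include <cstddef>

struct Dim3 { unsigned int x{1}, y{1}, z{1}; };  // stand-in for CUDA's dim3

void example(std::size_t batch_size, std::size_t beam_width)
{
    const auto n = std::min<std::size_t>(512, batch_size * beam_width);
    // Dim3 block{n};                            // error: narrowing in braced init
    Dim3 block{static_cast<unsigned int>(n)};    // OK: conversion made explicit
    (void)block;
}
```
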
src/turbomind/kernels/unfused_attention_kernels.cu

@@ -178,7 +178,11 @@ __global__ void softmax_kernel_h2(T* attn_score,
             qk_bias = hadd2<T2>(qk_bias, hmul2<T2>(hsub2<T2>(ONE, mask_val), NEG_INFTY));
 
             data[i] = hadd2<T2>(hmul2<T2>(qk, qk_scale_h2), qk_bias);
-            local_max = fmax(local_max, fmax((float)data[i].x, (float)data[i].y));
+            // if (std::is_same<T2, half2>::value) {
+            local_max = fmax(local_max, fmax((float)data[i].data[0], (float)data[i].data[1]));
+            // } else {
+            //     local_max = fmax(local_max, fmax((float)data[i].x, (float)data[i].y));
+            // }
         }
 
         float max_val = blockDim.x <= 32 ? warpReduceMax(local_max) : blockReduceMax<float>(local_max);

@@ -190,7 +194,11 @@ __global__ void softmax_kernel_h2(T* attn_score,
         float local_sum = 0.0f;
         for (int i = 0; blockDim.x * i + threadIdx.x < (k_length / 2) && i < ITEMS_PER_THREAD; i++) {
             data[i] = hexp2<T2>(hsub2<T2>(data[i], cuda_cast<T2>(s_max)));
-            local_sum += (float)(data[i].x + data[i].y);
+            // if (std::is_same<T2, half2>::value) {
+            local_sum += (float)(data[i].data[0] + data[i].data[1]);
+            // } else {
+            //     local_sum += (float)(data[i].x + data[i].y);
+            // }
         }
 
         float sum_val = blockDim.x <= 32 ? warpReduceSum(local_sum) : blockReduceSum<float>(local_sum);

@@ -310,7 +318,11 @@ __global__ void softmax_kernel_h2_v2(T* attn_score,
                     val = hadd2<T2>(val, pos_bias[j]);
                 }
                 data[j][i] = val;
-                local_max[j] = fmax(local_max[j], fmax((float)data[j][i].x, (float)data[j][i].y));
+                // if (std::is_same<T2, half2>::value) {
+                local_max[j] = fmax(local_max[j], fmax((float)data[j][i].data[0], (float)data[j][i].data[1]));
+                // } else {
+                //     local_max[j] = fmax(local_max[j], fmax((float)data[j][i].x, (float)data[j][i].y));
+                // }
             }
         }

@@ -343,7 +355,11 @@ __global__ void softmax_kernel_h2_v2(T* attn_score,
 #pragma unroll
             for (int j = 0; j < Q_ITEMS; j++) {
-                local_sum[j] += (float)(data[j][i].x + data[j][i].y);
+                // if (std::is_same<T2, half2>::value) {
+                local_sum[j] += (float)(data[j][i].data[0] + data[j][i].data[1]);
+                // } else {
+                //     local_sum[j] += (float)(data[j][i].x + data[j][i].y);
+                // }
             }
         }

@@ -1878,6 +1894,7 @@ void invokeMaskedSoftMaxWithRelPosBias(T* qk_buf,
                                        qk_scale);
     }
     else if (std::is_same<T, half>::value) {
+        printf("============xiabo_test %s:%d\n", __FILE__, __LINE__);
         softmax_withRelPosBias_element2_kernel<half2, half>
             <<<grid, block, 0, stream>>>((half2*)qk_buf,
                                          (const half2*)attn_mask,

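The commented-out `std::is_same` branches left in these hunks hint at the cleaner end state: choose the lane accessor once per packed type instead of patching every use site. With C++17 (the top-level build sets `CMAKE_CXX_STANDARD 17`) that dispatch can happen at compile time (a sketch; `lanes_to_float2` is an illustrative name, and the fallback branch assumes the type exposes `.x`/`.y`):

```cpp
#include <cuda_fp16.h>
#include <type_traits>

template<typename T2>
__device__ inline float2 lanes_to_float2(const T2& v)
{
    if constexpr (std::is_same_v<T2, __half2>) {
        // The intrinsics avoid naming the struct members at all.
        return make_float2(__low2float(v), __high2float(v));
    }
    else {
        return make_float2((float)v.x, (float)v.y);
    }
}
// e.g.: float2 f = lanes_to_float2(data[i]);
//       local_max = fmax(local_max, fmax(f.x, f.y));
```
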
src/turbomind/layers/CMakeLists.txt

@@ -13,12 +13,14 @@
 # limitations under the License.
 
 cmake_minimum_required(VERSION 3.8)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
 
 add_subdirectory(sampling_layers)
 
-find_package(CUDAToolkit REQUIRED)
+#find_package(CUDAToolkit REQUIRED)
+find_package(CUDA REQUIRED)
 
 add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc)
-set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(DynamicDecodeLayer PUBLIC CUDA::cudart TopKSamplingLayer
+# set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+target_link_libraries(DynamicDecodeLayer PUBLIC cudart TopKSamplingLayer
                       TopPSamplingLayer ban_bad_words stop_criteria gpt_kernels tensor nvtx_utils)

src/turbomind/layers/sampling_layers/CMakeLists.txt

@@ -14,19 +14,23 @@
 cmake_minimum_required(VERSION 3.8)
 
-find_package(CUDAToolkit REQUIRED)
+#find_package(CUDAToolkit REQUIRED)
+find_package(CUDA REQUIRED)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
 
 add_library(BaseSamplingLayer STATIC BaseSamplingLayer.cc)
-set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(BaseSamplingLayer PUBLIC CUDA::cudart sampling_penalty_kernels memory_utils)
+# set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+target_link_libraries(BaseSamplingLayer PUBLIC cudart sampling_penalty_kernels memory_utils)
 
 add_library(TopKSamplingLayer STATIC TopKSamplingLayer.cu)
-set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(TopKSamplingLayer PUBLIC CUDA::cudart BaseSamplingLayer sampling_topk_kernels)
+# set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+target_link_libraries(TopKSamplingLayer PUBLIC cudart BaseSamplingLayer sampling_topk_kernels)
 
 add_library(TopPSamplingLayer STATIC TopPSamplingLayer.cu)
-set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(TopPSamplingLayer PUBLIC CUDA::cudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)
+# set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+target_link_libraries(TopPSamplingLayer PUBLIC cudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)

src/turbomind/models/llama/CMakeLists.txt

@@ -2,9 +2,10 @@
 cmake_minimum_required(VERSION 3.8)
 
-add_subdirectory(fused_multi_head_attention)
+# add_subdirectory(fused_multi_head_attention)
 
-find_package(CUDAToolkit REQUIRED)
+#find_package(CUDAToolkit REQUIRED)
+find_package(CUDA REQUIRED)
 
 add_library(Llama STATIC
         LlamaV2.cc

@@ -20,10 +21,12 @@ add_library(Llama STATIC
         llama_kernels.cu
         llama_decoder_kernels.cu
         llama_utils.cu)
-set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(Llama PUBLIC CUDA::cudart
-        gemm_s4_f16
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
+#set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
+#set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+target_link_libraries(Llama PUBLIC cudart
+        # gemm_s4_f16
         cublasMMWrapper
         DynamicDecodeLayer
         activation_kernels

@@ -38,8 +41,8 @@ target_link_libraries(Llama PUBLIC CUDA::cudart
         memory_utils
         nccl_utils
         cuda_utils
-        logger
-        llama_fmha)
+        logger)
+        # llama_fmha)
 
 if (NOT MSVC)
     add_subdirectory(flash_attention2)

@@ -47,5 +50,5 @@ if (NOT MSVC)
 endif()
 
 add_executable(llama_gemm llama_gemm.cc)
-target_link_libraries(llama_gemm PUBLIC CUDA::cudart gpt_gemm_func memory_utils cuda_utils logger)
+target_link_libraries(llama_gemm PUBLIC cudart gpt_gemm_func memory_utils cuda_utils logger)
 
 install(TARGETS llama_gemm DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/bin)

src/turbomind/models/llama/LlamaContextAttentionLayer.cc

@@ -52,7 +52,8 @@ void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
     k_buf_2_ = q_buf_2_ + local_head_num_ * batch_size * max_q_len * size_per_head_;
     v_buf_2_ = k_buf_2_ + local_kv_head_num_ * batch_size * max_q_len * size_per_head_;
 
-    if (use_fmha_) {
+    // if (use_fmha_) {
+    if (0) {
         FlashAttentionOp<T> flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_);
         if (flash_attention.get_workspace_size() > 0) {
             qk_buf_float_ = (float*)allocator_->reMalloc(qk_buf_float_, flash_attention.get_workspace_size(), true);

@@ -86,7 +87,8 @@ void LlamaContextAttentionLayer<T>::freeBuffer()
         allocator_->free((void**)(&qkv_buf_));
         allocator_->free((void**)(&q_buf_2_));
 
-        if (use_fmha_) {
+        // if (use_fmha_) {
+        if (0) {
             allocator_->free((void**)&qk_buf_float_);
         }
         else {

@@ -209,7 +211,8 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
                                   weights->past_kv_scale.data());
     sync_check_cuda_error();
 
-    if (use_fmha_) {
+    // if (use_fmha_) {
+    if (0) {
         fusedMultiHeadAttention(k_cache_ptrs,
                                 v_cache_ptrs,
                                 layer_offset,

@@ -252,7 +255,7 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
     }
     sync_check_cuda_error();
 }
-
+#if 0
 template<typename T>
 void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptrs,
                                                             T** val_cache_ptrs,

@@ -311,7 +314,7 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptrs,
     //
     flash_attention(attn_params, stream_);
 }
-
+#endif
 template<typename T>
 void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_cache_ptrs,
                                                               T** val_cache_ptrs,

src/turbomind/models/llama/LlamaDecoderLayerWeight.cc

@@ -22,10 +22,18 @@
 #include "src/turbomind/models/llama/LlamaDenseWeight.h"
 #include "src/turbomind/utils/logger.h"
 #include "src/turbomind/utils/memory_utils.h"
-#include <filesystem>
+// #include <filesystem>
+#include <experimental/filesystem>
+#include <sys/stat.h>
+#include <string>
 
 namespace turbomind {
 
+bool fileExists(const std::string& path)
+{
+    struct stat buffer;
+    return (stat(path.c_str(), &buffer) == 0);
+}
+
 template<typename T>
 LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t head_num,
                                                     size_t kv_head_num,

@@ -129,7 +137,8 @@ void loadWeights(LlamaDenseWeight<T>& w,
     }
     else {
         // Disable slice if weight has already been sliced
-        if (std::filesystem::exists(max_prefix + ".weight") || std::filesystem::exists(max_prefix + ".qweight")) {
+        // if (std::filesystem::exists(max_prefix + ".weight") || std::filesystem::exists(max_prefix + ".qweight")) {
+        if (fileExists(max_prefix + ".weight") || fileExists(max_prefix + ".qweight")) {
             TM_LOG_DEBUG("TP weight exists. Disable runtime TP.");
             enable_slice = false;
         }

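`<filesystem>` is C++17, and the toolchain used here evidently could not provide it, so the commit swaps in a `stat(2)` wrapper (the `<experimental/filesystem>` include is pulled in, though the new code path only uses `<sys/stat.h>`). Had the experimental header been used instead, the pre-C++17 spelling would be roughly this sketch (requires linking `stdc++fs` on older GCC; `weightExists` and `prefix` are illustrative names):

```cpp
#include <experimental/filesystem>
#include <string>
namespace fs = std::experimental::filesystem;

bool weightExists(const std::string& prefix)
{
    return fs::exists(prefix + ".weight") || fs::exists(prefix + ".qweight");
}
```
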
src/turbomind/models/llama/LlamaLinear.h

@@ -2,7 +2,7 @@
 #pragma once
 
-#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
+// #include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
 #include "src/turbomind/models/llama/LlamaDenseWeight.h"
 #include "src/turbomind/models/llama/llama_kernels.h"
 #include "src/turbomind/utils/cublasMMWrapper.h"

@@ -61,29 +61,29 @@ private:
     void forwardInt4(T* output_data, const T* input_data, int batch_size, const LlamaDenseWeight<T>& weight, Type type)
     {
-        if constexpr (std::is_same_v<T, half>) {
-            gemm_s4_f16_.Run(output_data,
-                             (const uint*)weight.kernel,
-                             input_data,
-                             (const half2*)weight.scales_and_zeros,
-                             weight.output_dims,
-                             batch_size,
-                             weight.input_dims,
-                             weight.group_size,
-                             type == kFusedSiluFfn ? GemmS4F16::kFusedSiluFfn : GemmS4F16::kGemm,
-                             -1,
-                             stream_);
-            sync_check_cuda_error();
-        }
-        else {
+        // if constexpr (std::is_same_v<T, half>) {
+        //     gemm_s4_f16_.Run(output_data,
+        //                      (const uint*)weight.kernel,
+        //                      input_data,
+        //                      (const half2*)weight.scales_and_zeros,
+        //                      weight.output_dims,
+        //                      batch_size,
+        //                      weight.input_dims,
+        //                      weight.group_size,
+        //                      type == kFusedSiluFfn ? GemmS4F16::kFusedSiluFfn : GemmS4F16::kGemm,
+        //                      -1,
+        //                      stream_);
+        //     sync_check_cuda_error();
+        // }
+        // else {
             FT_CHECK_WITH_INFO(0, "Not implemented");
-        }
+        // }
     }
 
 private:
     cublasMMWrapper* cublas_wrapper_;
     cudaStream_t     stream_{};
-    GemmS4F16 gemm_s4_f16_;
+    // GemmS4F16 gemm_s4_f16_;
 };
 
 }  // namespace turbomind