OpenDAS / Lmdeploy · Commits

Commit e38ee081, authored Nov 14, 2023 by xiabo

    Adapt to rocm

Parent: 56942c43
The commit changes 41 files in total; this page shows 20 changed files, with 301 additions and 144 deletions (+301 −144).
Changed files shown on this page:

CMakeLists.txt                                                                                               +21  −18
generate.sh                                                                                                   +4   −1
src/turbomind/kernels/CMakeLists.txt                                                                         +29  −27
src/turbomind/kernels/activation_kernels.cu                                                                   +8   −4
src/turbomind/kernels/custom_ar_kernels.cu                                                                   +20   −6
src/turbomind/kernels/custom_ar_kernels.h                                                                     +2   −1
src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh    +76  −16
src/turbomind/kernels/decoder_masked_multihead_attention_utils.h                                             +40  −13
src/turbomind/kernels/gpt_kernels.cu                                                                          +2   −1
src/turbomind/kernels/logprob_kernels.cu                                                                      +2   −1
src/turbomind/kernels/sampling_topk_kernels.cu                                                                +2   −1
src/turbomind/kernels/sampling_topp_kernels.cu                                                                +2   −1
src/turbomind/kernels/stop_criteria_kernels.cu                                                                +2   −1
src/turbomind/kernels/unfused_attention_kernels.cu                                                           +21   −4
src/turbomind/layers/CMakeLists.txt                                                                           +7   −5
src/turbomind/layers/sampling_layers/CMakeLists.txt                                                          +14  −10
src/turbomind/models/llama/CMakeLists.txt                                                                    +12   −9
src/turbomind/models/llama/LlamaContextAttentionLayer.cc                                                      +8   −5
src/turbomind/models/llama/LlamaDecoderLayerWeight.cc                                                        +11   −2
src/turbomind/models/llama/LlamaLinear.h                                                                     +18  −18
CMakeLists.txt

@@ -44,18 +44,18 @@ option(BUILD_TEST "Build tests" OFF)
 include(FetchContent)

-FetchContent_Declare(
-        repo-cutlass
-        GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
-        GIT_TAG        6f47420213f757831fae65c686aa471749fa8d60
-)
-set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
-FetchContent_MakeAvailable(repo-cutlass)
-set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
-set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)
+# FetchContent_Declare(
+#         repo-cutlass
+#         GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
+#         GIT_TAG        cc85b64cf676c45f98a17e3a47c0aafcf817f088
+# )
+# set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
+# FetchContent_MakeAvailable(repo-cutlass)
+# set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
+# set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/turbomind/cutlass_extensions/include)

 option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF)

@@ -110,6 +110,7 @@ endif()
 set(CMAKE_C_FLAGS    "${CMAKE_C_FLAGS}")
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -ldl") # -Xptxas -v
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --gpu-max-threads-per-block=1024")
 set(SM_SETS 52 60 61 70 75 80 86 89 90)
 set(USING_WMMA False)

@@ -266,7 +267,8 @@ endif()
 if(BUILD_MULTI_GPU)
     list(APPEND COMMON_HEADER_DIRS ${MPI_INCLUDE_PATH})
-    list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib)
+    #list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib)
+    list(APPEND COMMON_LIB_DIRS /opt/mpi/lib)
 endif()

 if(USE_TRITONSERVER_DATATYPE)

@@ -311,11 +313,11 @@ endif()
 add_library(transformer-shared SHARED
   $<TARGET_OBJECTS:BaseSamplingLayer>
   $<TARGET_OBJECTS:DynamicDecodeLayer>
-  $<TARGET_OBJECTS:llama_fmha>
+  # $<TARGET_OBJECTS:llama_fmha>
   $<TARGET_OBJECTS:flash_attention2>
   $<TARGET_OBJECTS:Llama>
   $<TARGET_OBJECTS:LlamaTritonBackend>
-  $<TARGET_OBJECTS:gemm_s4_f16>
+  # $<TARGET_OBJECTS:gemm_s4_f16>
   $<TARGET_OBJECTS:TopKSamplingLayer>
   $<TARGET_OBJECTS:TopPSamplingLayer>
   $<TARGET_OBJECTS:TransformerTritonBackend>

@@ -353,15 +355,16 @@ target_link_libraries(transformer-shared PUBLIC
 endif()

 if (USE_NVTX)
-  target_link_libraries(transformer-shared PUBLIC
-    -lnvToolsExt
-  )
+  # target_link_libraries(transformer-shared PUBLIC
+  #   -lnvToolsExt
+  # )
 endif()

-set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON)
+# set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 set_target_properties(transformer-shared PROPERTIES LINKER_LANGUAGE CXX)
-target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcublasLt -lcurand)
+#target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcublasLt -lcurand)
+target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcurand)

 include(GNUInstallDirs)
 set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TurboMind)
generate.sh

 #!/bin/sh

 cmake .. \
+    -DCMAKE_CXX_COMPILER=nvcc \
+    -DCMAKE_C_COMPILER=nvcc \
     -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
     -DCMAKE_INSTALL_PREFIX=./install \
     -DBUILD_PY_FFI=ON \
     -DBUILD_MULTI_GPU=ON \
     -DCMAKE_CUDA_FLAGS="-lineinfo" \
-    -DUSE_NVTX=ON
+    -DUSE_NVTX=OFF \
+    # -DBUILD_TEST=ON
src/turbomind/kernels/CMakeLists.txt

@@ -13,61 +13,63 @@
 # limitations under the License.

 cmake_minimum_required(VERSION 3.8)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")

 add_library(ban_bad_words STATIC ban_bad_words.cu)
-set_property(TARGET ban_bad_words PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET ban_bad_words PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+# set_property(TARGET ban_bad_words PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET ban_bad_words PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

The same pair of POSITION_INDEPENDENT_CODE / CUDA_RESOLVE_DEVICE_SYMBOLS set_property() lines is commented out in exactly the same way for every other kernel target in this file: stop_criteria, activation_kernels, logprob_kernels, unfused_attention_kernels, bert_preprocess_kernels, decoder_masked_multihead_attention, decoding_kernels, gpt_kernels, sampling_topk_kernels, sampling_topp_kernels, sampling_penalty_kernels, and custom_ar_kernels. The corresponding add_library() lines and the set()/file(GLOB ./decoder_masked_multihead_attention/*.cu) lines for decoder_masked_multihead_attention are unchanged. At the end of the file:

-add_subdirectory(gemm_s_f16)
+# add_subdirectory(gemm_s_f16)
src/turbomind/kernels/activation_kernels.cu

@@ -106,7 +106,8 @@ struct ReluActivation<half2> {
     static __device__ __forceinline__ half2 apply(const half2& val)
     {
         const half zero_half = static_cast<half>(0.0f);
-        return make_half2(val.x > zero_half ? val.x : zero_half, val.y > zero_half ? val.y : zero_half);
+        // return make_half2(val.x > zero_half ? val.x : zero_half, val.y > zero_half ? val.y : zero_half);
+        return make_half2(static_cast<half>(val.data[0]) > zero_half ? static_cast<half>(val.data[0]) : zero_half, static_cast<half>(val.data[1]) > zero_half ? static_cast<half>(val.data[1]) : zero_half);
     }
 };

@@ -117,7 +118,8 @@ struct ReluActivation<__nv_bfloat162> {
     static __device__ __forceinline__ __nv_bfloat162 apply(const __nv_bfloat162& val)
     {
         const __nv_bfloat16 zero_bf16 = static_cast<__nv_bfloat16>(0.0f);
-        return make_bfloat162(val.x > zero_bf16 ? val.x : zero_bf16, val.y > zero_bf16 ? val.y : zero_bf16);
+        // return make_bfloat162(val.x > zero_bf16 ? val.x : zero_bf16, val.y > zero_bf16 ? val.y : zero_bf16);
+        return make_bfloat162(val.data[0] > zero_bf16 ? val.data[0] : zero_bf16, val.data[1] > zero_bf16 ? val.data[1] : zero_bf16);
     }
 };
 #endif

@@ -138,7 +140,8 @@ struct SiluActivation<half2> {
     using return_type = float2;
     static __device__ __forceinline__ float2 apply(const half2& val)
     {
-        return make_float2(SiluActivation<float>::apply(val.x), SiluActivation<float>::apply(val.y));
+        // return make_float2(SiluActivation<float>::apply(val.x), SiluActivation<float>::apply(val.y));
+        return make_float2(SiluActivation<float>::apply(val.data[0]), SiluActivation<float>::apply(val.data[1]));
     }
 };

@@ -148,7 +151,8 @@ struct SiluActivation<__nv_bfloat162> {
     using return_type = float2;
     static __device__ __forceinline__ float2 apply(const __nv_bfloat162& val)
     {
-        return make_float2(SiluActivation<float>::apply(val.x), SiluActivation<float>::apply(val.y));
+        // return make_float2(SiluActivation<float>::apply(val.x), SiluActivation<float>::apply(val.y));
+        return make_float2(SiluActivation<float>::apply(val.data[0]), SiluActivation<float>::apply(val.data[1]));
     }
 };
 #endif // ENABLE_BF16
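Note on the pattern above: CUDA's half2 / __nv_bfloat162 expose their two lanes as .x and .y, while the ROCm headers targeted by this port expose them through a .data[] member, which is why the kernels switch to val.data[0] / val.data[1]. A minimal sketch of an accessor that hides the difference; the helper name and the __HIP_PLATFORM_AMD__ guard are illustrative assumptions, not part of the commit:

    #include <cuda_fp16.h>   // half, half2 (the HIP fp16 header plays the same role under ROCm)

    // Hypothetical accessor: one spelling for both half2 layouts.
    __device__ __forceinline__ half half2_elem(const half2& v, int i)
    {
    #if defined(__HIP_PLATFORM_AMD__)
        return v.data[i];              // ROCm layout, as relied on by this diff
    #else
        return (i == 0) ? v.x : v.y;   // CUDA layout
    #endif
    }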
src/turbomind/kernels/custom_ar_kernels.cu

@@ -24,7 +24,12 @@ namespace turbomind {
 static inline __device__ uint32_t hadd2(const uint32_t& a, const uint32_t& b)
 {
     uint32_t c;
-    asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    // asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    const __half* ha = reinterpret_cast<const __half*>(&a);
+    const __half* hb = reinterpret_cast<const __half*>(&b);
+    __half2 h2c = make_half2(ha[0] + hb[0], ha[1] + hb[1]);
+    __builtin_memcpy(&c, &h2c, sizeof(h2c));
+    // asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
     return c;
 }

@@ -33,7 +38,12 @@ static inline __device__ uint32_t hadd2(const uint32_t& a, const uint32_t& b)
 static inline __device__ uint32_t fadd(const uint32_t& a, const uint32_t& b)
 {
     uint32_t c;
-    asm volatile("add.f32 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    // asm volatile("add.f32 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    union { float* f_p; const uint32_t* u_p; } x, y, z;
+    x.u_p = &a;
+    y.u_p = &b;
+    z.u_p = &c;
+    *z.f_p = *x.f_p + *y.f_p;
     return c;
 }

@@ -42,10 +52,12 @@ static inline __device__ uint32_t fadd(const uint32_t& a, const uint32_t& b)
 static inline __device__ void st_flag_release(uint32_t& flag, uint32_t* flag_addr)
 {
 #if __CUDA_ARCH__ >= 700
-    asm volatile("st.global.release.sys.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+    // asm volatile("st.global.release.sys.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+    *flag_addr = flag;
 #else
     __threadfence_system();
-    asm volatile("st.global.volatile.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+    // asm volatile("st.global.volatile.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+    *flag_addr = flag;
 #endif
 }

@@ -54,9 +66,11 @@ static inline __device__ void st_flag_release(uint32_t& flag, uint32_t* flag_addr)
 static inline __device__ void ld_flag_acquire(uint32_t& flag, uint32_t* flag_addr)
 {
 #if __CUDA_ARCH__ >= 700
-    asm volatile("ld.global.acquire.sys.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
+    // asm volatile("ld.global.acquire.sys.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
+    flag = *flag_addr;
 #else
-    asm volatile("ld.global.volatile.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
+    // asm volatile("ld.global.volatile.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr));
+    flag = *flag_addr;
 #endif
 }
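Worth noting: the original PTX st.global.release / ld.global.acquire carried explicit memory-ordering semantics that a plain pointer store or load does not provide by itself. A minimal sketch of a portable variant that keeps an explicit fence; the function names are assumptions and this is not what the commit does:

    // Sketch only: approximate the release store / acquire load with a system-wide
    // fence around a volatile access, which both nvcc and hipcc accept.
    static inline __device__ void st_flag_release_fenced(uint32_t flag, volatile uint32_t* flag_addr)
    {
        __threadfence_system();  // make earlier writes visible before publishing the flag
        *flag_addr = flag;
    }

    static inline __device__ uint32_t ld_flag_acquire_fenced(volatile uint32_t* flag_addr)
    {
        uint32_t flag = *flag_addr;
        __threadfence_system();  // order the flag read before the data reads that follow
        return flag;
    }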
src/turbomind/kernels/custom_ar_kernels.h

@@ -27,7 +27,8 @@
 #define MAX_ALL_REDUCE_BLOCKS 24
 #define FLAG(a) ((uint32_t)((a) % 0x146))
 #define RANKS_PER_NODE 8
-#define WARP_SIZE 32
+// #define WARP_SIZE 32
+#define WARP_SIZE 64
 #define DEFAULT_BLOCK_SIZE 1024
 #define DEFALUT_ALGO_AR_SIZE_THRESHOLD 393216
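WARP_SIZE moves from 32 to 64 because AMD GCN/CDNA GPUs execute 64-lane wavefronts, whereas NVIDIA warps are 32 lanes wide. A sketch of a guard that keeps one header buildable on both platforms; the __HIP_PLATFORM_AMD__ test is an assumption about the toolchain, not part of the commit:

    // Illustrative alternative to hard-coding 64:
    #if defined(__HIP_PLATFORM_AMD__)
    #define WARP_SIZE 64   // AMD wavefront
    #else
    #define WARP_SIZE 32   // NVIDIA warp
    #endif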
src/turbomind/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh

@@ -628,19 +628,68 @@ struct Qk_dot {
 };

 ////////////////////////////////////////////////////////////////////////////////////////////////////

+__device__ inline void f16mulf16addf32(uint32_t& a, uint32_t& b, const float* c, float* d)
+{
+    // uint32_t res = 0;
+    // asm volatile("v_pk_fma_f16 %0, %1,%2,%3" : "=v"(res) : "v"(a), "v"(b), "v"(res));
+    // __half* h = reinterpret_cast<__half*>(&res);
+    __half* ha = reinterpret_cast<__half*>(&a);
+    __half* hb = reinterpret_cast<__half*>(&b);
+    *d = *c + __half2float(ha[0]) * __half2float(hb[0]) + __half2float(ha[1]) * __half2float(hb[1]);
+}
+
+// row 8 col 4
+__device__ inline void m16n8k8(const uint32_t* A, const uint32_t* B, /*const float* C,*/ float* D)
+{
+    int tid    = threadIdx.x;
+    int baseId = tid / 32 * 32;
+    __shared__ uint32_t smem[1024 * 3];
+    int base = tid * 3;
+    __builtin_memcpy(smem + base, A, sizeof(uint32_t));
+    __builtin_memcpy(smem + (base + 1), A + 1, sizeof(uint32_t));
+    __builtin_memcpy(smem + (base + 2), B, sizeof(uint32_t));
+    __syncthreads();
+    /* From D's point of view, each lane computes its own piece of D: starting from lane 0,
+       it loops over one row of A and two columns of B.
+       s is the lane index into the B matrix,
+       baseA is the lane index into A,
+       baseB0 is the first B column fetched by this lane, baseB1 the second. */
+    int s = baseId + (tid % 4) * 8, e = s + 4;
+    for (int i = s; i < e; ++i) {
+        // A[0]->i  A[1]->i+1  B[0]->i+2
+        int baseA = (tid - tid % 4 + i - s) * 3;
+        // lane id of the first column in this lane's row, plus the stride, then *3
+        int baseB0 = i * 3, baseB1 = (i + 4) * 3;
+        f16mulf16addf32(smem[baseA], smem[baseB0 + 2], D, D);
+        f16mulf16addf32(smem[baseA], smem[baseB1 + 2], D + 1, D + 1);
+        f16mulf16addf32(smem[baseA + 1], smem[baseB0 + 2], D + 2, D + 2);
+        f16mulf16addf32(smem[baseA + 1], smem[baseB1 + 2], D + 3, D + 3);
+    }
+}
+
 inline __device__ float4 hmma_fp32(const uint2& a, uint32_t b)
 {
     float4 c;
     float zero = 0.f;
-    asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 \n"
-                 "    {%0, %1, %2, %3}, \n"
-                 "    {%4, %5}, \n"
-                 "    {%6}, \n"
-                 "    {%7, %7, %7, %7}; \n"
-                 : "=f"(c.x), "=f"(c.y), "=f"(c.z), "=f"(c.w)
-                 : "r"(a.x), "r"(a.y), "r"(b), "f"(zero));
+    const uint32_t* A = reinterpret_cast<const uint32_t*>(&a);
+    const uint32_t* B = reinterpret_cast<const uint32_t*>(b);
+    float*          C = reinterpret_cast<float*>(&c);
+    m16n8k8(A, B, C);
+    // asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 \n"
+    //              "    {%0, %1, %2, %3}, \n"
+    //              "    {%4, %5}, \n"
+    //              "    {%6}, \n"
+    //              "    {%7, %7, %7, %7}; \n"
+    //              : "=f"(c.x), "=f"(c.y), "=f"(c.z), "=f"(c.w)
+    //              : "r"(a.x), "r"(a.y), "r"(b), "f"(zero));
     return c;
 }

@@ -688,7 +737,8 @@ struct Qk_dot<uint16_t, 4> {
 ////////////////////////////////////////////////////////////////////////////////////////////////////

-template<int WARPS_PER_BLOCK, int WARP_SIZE = 32>
+// template<int WARPS_PER_BLOCK, int WARP_SIZE = 32>
+template<int WARPS_PER_BLOCK, int WARP_SIZE = 64>
 inline __device__ float block_sum(float* red_smem, float sum)
 {

@@ -1110,12 +1160,21 @@ inline __device__ Float8_ dequant(int64_t a, const float scale, const float zp)
 inline __device__ int8_t cast_to_int8(float val)
 {
-    union {
-        int8_t  int8[2];
-        int16_t int16;
-    };
-    asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val));
-    return int8[0];
+    // union {
+    //     int8_t  int8[2];
+    //     int16_t int16;
+    // };
+    // asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val));
+    // return int8[0];
+    int8_t dst;
+    if (val >= 128) {
+        dst = 127;
+    }
+    else if (val < -128) {
+        dst = -128;
+    }
+    else {
+        dst = static_cast<int8_t>(val);
+    }
+    return dst;
 }

 ////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -1239,7 +1298,8 @@ __global__ void masked_multihead_attention_kernel(Multihead_attention_params<T>
     static_assert(Dh_MAX % THREADS_PER_VALUE == 0, "");

     // The size of a warp.
-    constexpr int WARP_SIZE = 32;
+    // constexpr int WARP_SIZE = 32;
+    constexpr int WARP_SIZE = 64;

     // The number of warps in a threadblock.
     constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE;
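One behavioral detail in cast_to_int8: the original PTX cvt.rni.sat.s8.f32 rounds to the nearest integer (ties to even) before saturating, while static_cast<int8_t> truncates toward zero. A sketch of a closer portable equivalent; the function name is hypothetical and this is not the code the commit uses:

    inline __device__ int8_t cast_to_int8_rounded(float val)
    {
        float r = rintf(val);                   // round to nearest, ties to even (matches cvt.rni)
        r = fminf(fmaxf(r, -128.0f), 127.0f);   // saturate to the int8 range (matches .sat)
        return static_cast<int8_t>(r);
    }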
src/turbomind/kernels/decoder_masked_multihead_attention_utils.h

@@ -147,7 +147,8 @@ inline __device__ float4 add(float4 a, float4 b)
 inline __device__ uint16_t add(uint16_t a, uint16_t b)
 {
     uint16_t c;
-    asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
+    // asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
+    asm volatile("v_add_f16 %0, %1, %2;" : "=v"(c) : "v"(a), "v"(b));
     return c;
 }

@@ -156,7 +157,11 @@ inline __device__ uint16_t add(uint16_t a, uint16_t b)
 inline __device__ uint32_t add(uint32_t a, uint32_t b)
 {
     uint32_t c;
-    asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    // asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    const __half* ha = reinterpret_cast<const __half*>(&a);
+    const __half* hb = reinterpret_cast<const __half*>(&b);
+    __half2 h2c = make_half2(ha[0] + hb[0], ha[1] + hb[1]);
+    __builtin_memcpy(&c, &h2c, sizeof(h2c));
     return c;
 }

@@ -192,9 +197,13 @@ inline __device__ uint16_t float_to_half(float f)
     } tmp;
 #if 0 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 // Is it better?
     float zero = 0.f;
-    asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(zero), "f"(f));
+    // asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(zero), "f"(f));
+    __half h = __float2half(f);
+    tmp.u16[0] = reinterpret_cast<const uint16_t&>(h);
 #else
-    asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f));
+    // asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f));
+    __half h = __float2half(f);
+    tmp.u16[0] = reinterpret_cast<const uint16_t&>(h);
 #endif
     return tmp.u16[0];
 }

@@ -208,10 +217,18 @@ inline __device__ uint32_t float2_to_half2(float2 f)
         uint16_t u16[2];
     } tmp;
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-    asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
+    // asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
+    __half h1 = __float2half(f.x);
+    __half h2 = __float2half(f.y);
+    tmp.u16[0] = reinterpret_cast<const uint16_t&>(h1);
+    tmp.u16[1] = reinterpret_cast<const uint16_t&>(h2);
 #else
-    asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
-    asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
+    // asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
+    // asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
+    __half h1 = __float2half(f.x);
+    __half h2 = __float2half(f.y);
+    tmp.u16[0] = reinterpret_cast<const uint16_t&>(h1);
+    tmp.u16[1] = reinterpret_cast<const uint16_t&>(h2);
 #endif
     return tmp.u32;
 }

@@ -221,7 +238,8 @@ inline __device__ uint32_t float2_to_half2(float2 f)
 inline __device__ float half_to_float(uint16_t h)
 {
     float f;
-    asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h));
+    // asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h));
+    f = __half2float(reinterpret_cast<const __half&>(h));
     return f;
 }

@@ -230,7 +248,9 @@ inline __device__ float half_to_float(uint16_t h)
 inline __device__ float2 half2_to_float2(uint32_t v)
 {
     uint16_t lo, hi;
-    asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v));
+    // asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v));
+    lo = v & 0xffff;
+    hi = (v >> 16) & 0xffff;
     return make_float2(half_to_float(lo), half_to_float(hi));
 }

@@ -276,7 +296,11 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb)
 inline __device__ uint32_t h0_h0(uint16_t a)
 {
     uint32_t b;
-    asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a));
+    // asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a));
+    uint16_t tmp[2];
+    tmp[0] = a;
+    tmp[1] = a;
+    __builtin_memcpy(&b, tmp, sizeof(uint16_t) * 2);
     return b;
 }

@@ -370,7 +394,8 @@ inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c)
 inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c)
 {
     uint32_t d;
-    asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
+    // asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
+    asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c));
     return d;
 }

@@ -581,7 +606,8 @@ template<>
 inline __device__ uint16_t mul(uint16_t a, uint16_t b)
 {
     uint16_t c;
-    asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
+    // asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
+    asm volatile("v_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
     return c;
 }

@@ -591,7 +617,8 @@ template<>
 inline __device__ uint32_t mul(uint32_t a, uint32_t b)
 {
     uint32_t c;
-    asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    // asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
+    asm volatile("v_pk_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
     return c;
 }
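Several of these helpers now emit GCN vector instructions (v_add_f16, v_mul_f16, v_pk_mul_f16, v_pk_fma_f16) through inline assembly with "v" register constraints, which only compiles for AMD targets. For comparison, a sketch of an intrinsic-based spelling of the packed-half FMA that both nvcc and hipcc accept; the name fma_h2 is an assumption, not part of the commit:

    inline __device__ uint32_t fma_h2(uint32_t a, uint32_t b, uint32_t c)
    {
        __half2 ha, hb, hc;
        __builtin_memcpy(&ha, &a, sizeof(ha));
        __builtin_memcpy(&hb, &b, sizeof(hb));
        __builtin_memcpy(&hc, &c, sizeof(hc));
        __half2 hd = __hfma2(ha, hb, hc);   // packed half fused multiply-add intrinsic
        uint32_t d;
        __builtin_memcpy(&d, &hd, sizeof(d));
        return d;
    }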
src/turbomind/kernels/gpt_kernels.cu

@@ -20,7 +20,8 @@
 #elif (CUDART_VERSION >= 11000)
 #include <cub/cub.cuh>
 #else
-#include "3rdparty/cub/cub.cuh"
+// #include "3rdparty/cub/cub.cuh"
+#include <cub/cub.cuh>
 #endif
 #include "src/turbomind/kernels/gpt_kernels.h"
 #include "src/turbomind/utils/memory_utils.h"
src/turbomind/kernels/logprob_kernels.cu

@@ -23,7 +23,8 @@
 #elif (CUDART_VERSION >= 11000)
 #include <cub/cub.cuh>
 #else
-#include "3rdparty/cub/cub.cuh"
+// #include "3rdparty/cub/cub.cuh"
+#include <cub/cub.cuh>
 #endif
 #include "src/turbomind/kernels/logprob_kernels.h"
src/turbomind/kernels/sampling_topk_kernels.cu

@@ -21,7 +21,8 @@
 #elif (CUDART_VERSION >= 11000)
 #include <cub/cub.cuh>
 #else
-#include "3rdparty/cub/cub.cuh"
+// #include "3rdparty/cub/cub.cuh"
+#include <cub/cub.cuh>
 #endif
 #include "src/turbomind/kernels/reduce_kernel_utils.cuh"
src/turbomind/kernels/sampling_topp_kernels.cu

@@ -19,7 +19,8 @@
 #elif (CUDART_VERSION >= 11000)
 #include <cub/cub.cuh>
 #else
-#include "3rdparty/cub/cub.cuh"
+// #include "3rdparty/cub/cub.cuh"
+#include <cub/cub.cuh>
 #endif
 #include "src/turbomind/kernels/reduce_kernel_utils.cuh"
src/turbomind/kernels/stop_criteria_kernels.cu

@@ -145,7 +145,8 @@ void invokeLengthCriterion(bool* finished,
     // Check if we have attained the sequence length limit. If so, stop the sequence.
     // In addition, check if all sequences are stopped and return the result in should_stop
     TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
-    dim3 block{min(512, uint32_t(batch_size * beam_width))};
+    // dim3 block{min(512, uint32_t(batch_size * beam_width))};
+    dim3 block{static_cast<unsigned int>(min(512, uint32_t(batch_size * beam_width)))};
    dim3 grid{1};
    h_pinned_finished_sum_[0] = -1;
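The added static_cast<unsigned int> is presumably there because brace-initializing dim3 from the result of min(512, uint32_t(...)) can be treated as a narrowing conversion error by the clang-based hipcc. A small host-side sketch of the same clamp with an unambiguous type; the helper name is hypothetical:

    #include <algorithm>
    #include <cstdint>
    #include <cuda_runtime.h>   // dim3 (the HIP runtime header provides the same type under ROCm)

    // Hypothetical helper: clamp the launch width to 512 threads without narrowing.
    static dim3 make_block(uint32_t batch_size, uint32_t beam_width)
    {
        uint32_t n = std::min<uint32_t>(512u, batch_size * beam_width);
        return dim3{n, 1u, 1u};
    }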
src/turbomind/kernels/unfused_attention_kernels.cu

@@ -178,7 +178,11 @@ __global__ void softmax_kernel_h2(T* attn_score,
         qk_bias = hadd2<T2>(qk_bias, hmul2<T2>(hsub2<T2>(ONE, mask_val), NEG_INFTY));

         data[i] = hadd2<T2>(hmul2<T2>(qk, qk_scale_h2), qk_bias);

-        local_max = fmax(local_max, fmax((float)data[i].x, (float)data[i].y));
+        // if (std::is_same<T2, half2>::value) {
+        local_max = fmax(local_max, fmax((float)data[i].data[0], (float)data[i].data[1]));
+        // } else {
+        //     local_max = fmax(local_max, fmax((float)data[i].x, (float)data[i].y));
+        // }
     }

     float max_val = blockDim.x <= 32 ? warpReduceMax(local_max) : blockReduceMax<float>(local_max);

@@ -190,7 +194,11 @@ __global__ void softmax_kernel_h2(T* attn_score,
     float local_sum = 0.0f;
     for (int i = 0; blockDim.x * i + threadIdx.x < (k_length / 2) && i < ITEMS_PER_THREAD; i++) {
         data[i] = hexp2<T2>(hsub2<T2>(data[i], cuda_cast<T2>(s_max)));

-        local_sum += (float)(data[i].x + data[i].y);
+        // if (std::is_same<T2, half2>::value) {
+        local_sum += (float)(data[i].data[0] + data[i].data[1]);
+        // } else {
+        //     local_sum += (float)(data[i].x + data[i].y);
+        // }
     }

     float sum_val = blockDim.x <= 32 ? warpReduceSum(local_sum) : blockReduceSum<float>(local_sum);

@@ -310,7 +318,11 @@ __global__ void softmax_kernel_h2_v2(T* attn_score,
                 val = hadd2<T2>(val, pos_bias[j]);
             }

             data[j][i] = val;

-            local_max[j] = fmax(local_max[j], fmax((float)data[j][i].x, (float)data[j][i].y));
+            // if (std::is_same<T2, half2>::value) {
+            local_max[j] = fmax(local_max[j], fmax((float)data[j][i].data[0], (float)data[j][i].data[1]));
+            // } else {
+            //     local_max[j] = fmax(local_max[j], fmax((float)data[j][i].x, (float)data[j][i].y));
+            // }
         }
     }

@@ -343,7 +355,11 @@ __global__ void softmax_kernel_h2_v2(T* attn_score,
 #pragma unroll
         for (int j = 0; j < Q_ITEMS; j++) {
-            local_sum[j] += (float)(data[j][i].x + data[j][i].y);
+            // if (std::is_same<T2, half2>::value) {
+            local_sum[j] += (float)(data[j][i].data[0] + data[j][i].data[1]);
+            // } else {
+            //     local_sum[j] += (float)(data[j][i].x + data[j][i].y);
+            // }
         }
     }

@@ -1878,6 +1894,7 @@ void invokeMaskedSoftMaxWithRelPosBias(T* qk_buf,
                                        qk_scale);
     }
     else if (std::is_same<T, half>::value) {
+        printf("============xiabo_test %s:%d\n", __FILE__, __LINE__);
         softmax_withRelPosBias_element2_kernel<half2, half>
             <<<grid, block, 0, stream>>>((half2*)qk_buf,
                                          (const half2*)attn_mask,
src/turbomind/layers/CMakeLists.txt

@@ -13,12 +13,14 @@
 # limitations under the License.

 cmake_minimum_required(VERSION 3.8)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")

 add_subdirectory(sampling_layers)

-find_package(CUDAToolkit REQUIRED)
+#find_package(CUDAToolkit REQUIRED)
+find_package(CUDA REQUIRED)

 add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc)
-set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(DynamicDecodeLayer PUBLIC CUDA::cudart TopKSamplingLayer
-        TopPSamplingLayer ban_bad_words stop_criteria gpt_kernels tensor nvtx_utils)
+# set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+target_link_libraries(DynamicDecodeLayer PUBLIC cudart TopKSamplingLayer
+        TopPSamplingLayer ban_bad_words stop_criteria gpt_kernels tensor nvtx_utils)
src/turbomind/layers/sampling_layers/CMakeLists.txt

@@ -14,19 +14,23 @@
 cmake_minimum_required(VERSION 3.8)

-find_package(CUDAToolkit REQUIRED)
+#find_package(CUDAToolkit REQUIRED)
+find_package(CUDA REQUIRED)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")

 add_library(BaseSamplingLayer STATIC BaseSamplingLayer.cc)
-set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(BaseSamplingLayer PUBLIC CUDA::cudart sampling_penalty_kernels memory_utils)
+# set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+target_link_libraries(BaseSamplingLayer PUBLIC cudart sampling_penalty_kernels memory_utils)

 add_library(TopKSamplingLayer STATIC TopKSamplingLayer.cu)
-set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(TopKSamplingLayer PUBLIC CUDA::cudart BaseSamplingLayer sampling_topk_kernels)
+# set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+target_link_libraries(TopKSamplingLayer PUBLIC cudart BaseSamplingLayer sampling_topk_kernels)

 add_library(TopPSamplingLayer STATIC TopPSamplingLayer.cu)
-set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(TopPSamplingLayer PUBLIC CUDA::cudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)
+# set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
+# set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+target_link_libraries(TopPSamplingLayer PUBLIC cudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)
src/turbomind/models/llama/CMakeLists.txt

@@ -2,9 +2,10 @@
 cmake_minimum_required(VERSION 3.8)

-add_subdirectory(fused_multi_head_attention)
+# add_subdirectory(fused_multi_head_attention)

-find_package(CUDAToolkit REQUIRED)
+#find_package(CUDAToolkit REQUIRED)
+find_package(CUDA REQUIRED)

 add_library(Llama STATIC
         LlamaV2.cc

@@ -20,10 +21,12 @@ add_library(Llama STATIC
         llama_kernels.cu
         llama_decoder_kernels.cu
         llama_utils.cu)
-set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(Llama PUBLIC CUDA::cudart
-        gemm_s4_f16
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
+#set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
+#set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+target_link_libraries(Llama PUBLIC cudart
+        # gemm_s4_f16
         cublasMMWrapper
         DynamicDecodeLayer
         activation_kernels

@@ -38,8 +41,8 @@ target_link_libraries(Llama PUBLIC CUDA::cudart
         memory_utils
         nccl_utils
         cuda_utils
         logger
-        llama_fmha)
+        )
+#        llama_fmha)

 if (NOT MSVC)
     add_subdirectory(flash_attention2)

@@ -47,5 +50,5 @@ if (NOT MSVC)
 endif()

 add_executable(llama_gemm llama_gemm.cc)
-target_link_libraries(llama_gemm PUBLIC CUDA::cudart gpt_gemm_func memory_utils cuda_utils logger)
+target_link_libraries(llama_gemm PUBLIC cudart gpt_gemm_func memory_utils cuda_utils logger)

 install(TARGETS llama_gemm DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/bin)
src/turbomind/models/llama/LlamaContextAttentionLayer.cc

@@ -52,7 +52,8 @@ void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
     k_buf_2_ = q_buf_2_ + local_head_num_ * batch_size * max_q_len * size_per_head_;
     v_buf_2_ = k_buf_2_ + local_kv_head_num_ * batch_size * max_q_len * size_per_head_;

-    if (use_fmha_) {
+    // if (use_fmha_) {
+    if (0) {
         FlashAttentionOp<T> flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_);
         if (flash_attention.get_workspace_size() > 0) {
             qk_buf_float_ = (float*)allocator_->reMalloc(qk_buf_float_, flash_attention.get_workspace_size(), true);

@@ -86,7 +87,8 @@ void LlamaContextAttentionLayer<T>::freeBuffer()
     allocator_->free((void**)(&qkv_buf_));
     allocator_->free((void**)(&q_buf_2_));

-    if (use_fmha_) {
+    // if (use_fmha_) {
+    if (0) {
         allocator_->free((void**)&qk_buf_float_);
     }
     else {

@@ -209,7 +211,8 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
                                  weights->past_kv_scale.data());
     sync_check_cuda_error();

-    if (use_fmha_) {
+    // if (use_fmha_) {
+    if (0) {
         fusedMultiHeadAttention(k_cache_ptrs,
                                 v_cache_ptrs,
                                 layer_offset,

@@ -252,7 +255,7 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
     }
     sync_check_cuda_error();
 }

+#if 0
 template<typename T>
 void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptrs,
                                                             T** val_cache_ptrs,

@@ -311,7 +314,7 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptr
     //
     flash_attention(attn_params, stream_);
 }
+#endif

 template<typename T>
 void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_cache_ptrs,
                                                               T** val_cache_ptrs,
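In this file the fused-attention path is disabled by rewriting if (use_fmha_) as if (0) and compiling out the fusedMultiHeadAttention definition with #if 0 / #endif, so every request falls through to unfusedMultiHeadAttention. A lighter-touch alternative, shown only as a sketch and not what the commit does, would force the existing flag off in one place and leave the branches untouched:

    // Hypothetical one-liner, e.g. where the layer is constructed; use_fmha_ is the
    // member this file already tests.
    use_fmha_ = false;  // cutlass/FMHA kernels are not built in this ROCm port; use the unfused path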
src/turbomind/models/llama/LlamaDecoderLayerWeight.cc

@@ -22,10 +22,18 @@
 #include "src/turbomind/models/llama/LlamaDenseWeight.h"
 #include "src/turbomind/utils/logger.h"
 #include "src/turbomind/utils/memory_utils.h"
-#include <filesystem>
+// #include <filesystem>
+#include <experimental/filesystem>
+#include <sys/stat.h>
+#include <string>

 namespace turbomind {

+bool fileExists(const std::string& path)
+{
+    struct stat buffer;
+    return (stat(path.c_str(), &buffer) == 0);
+}
+
 template<typename T>
 LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t head_num,
                                                     size_t kv_head_num,

@@ -129,7 +137,8 @@ void loadWeights(LlamaDenseWeight<T>& w,
     }
     else {
         // Disable slice if weight has already been sliced
-        if (std::filesystem::exists(max_prefix + ".weight") || std::filesystem::exists(max_prefix + ".qweight")) {
+        // if (std::filesystem::exists(max_prefix + ".weight") || std::filesystem::exists(max_prefix + ".qweight")) {
+        if (fileExists(max_prefix + ".weight") || fileExists(max_prefix + ".qweight")) {
             TM_LOG_DEBUG("TP weight exists. Disable runtime TP.");
             enable_slice = false;
         }
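The port replaces std::filesystem::exists with the stat()-based fileExists helper shown above (and pulls in <experimental/filesystem>, presumably to suit an older toolchain). A small standalone usage sketch of the same helper; the weight-file prefix is a hypothetical example, not a path from the repository:

    #include <cstdio>
    #include <string>
    #include <sys/stat.h>

    static bool fileExists(const std::string& path)
    {
        struct stat buffer;
        return (stat(path.c_str(), &buffer) == 0);  // true if the path can be stat'ed
    }

    int main()
    {
        const std::string prefix = "./workspace/layers.0.attention.w_qkv";  // hypothetical prefix
        const bool tp_weight_present =
            fileExists(prefix + ".weight") || fileExists(prefix + ".qweight");
        std::printf("TP weight present: %d\n", tp_weight_present);
        return 0;
    }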
src/turbomind/models/llama/LlamaLinear.h

@@ -2,7 +2,7 @@
 #pragma once

-#include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
+// #include "src/turbomind/kernels/gemm_s_f16/gemm_s4_f16.h"
 #include "src/turbomind/models/llama/LlamaDenseWeight.h"
 #include "src/turbomind/models/llama/llama_kernels.h"
 #include "src/turbomind/utils/cublasMMWrapper.h"

@@ -61,29 +61,29 @@ private:
     void forwardInt4(T* output_data, const T* input_data, int batch_size, const LlamaDenseWeight<T>& weight, Type type)
     {
-        if constexpr (std::is_same_v<T, half>) {
-            gemm_s4_f16_.Run(output_data,
-                             (const uint*)weight.kernel,
-                             input_data,
-                             (const half2*)weight.scales_and_zeros,
-                             weight.output_dims,
-                             batch_size,
-                             weight.input_dims,
-                             weight.group_size,
-                             type == kFusedSiluFfn ? GemmS4F16::kFusedSiluFfn : GemmS4F16::kGemm,
-                             -1,
-                             stream_);
-            sync_check_cuda_error();
-        }
-        else {
-            FT_CHECK_WITH_INFO(0, "Not implemented");
-        }
+        // if constexpr (std::is_same_v<T, half>) {
+        //     gemm_s4_f16_.Run(output_data,
+        //                      (const uint*)weight.kernel,
+        //                      input_data,
+        //                      (const half2*)weight.scales_and_zeros,
+        //                      weight.output_dims,
+        //                      batch_size,
+        //                      weight.input_dims,
+        //                      weight.group_size,
+        //                      type == kFusedSiluFfn ? GemmS4F16::kFusedSiluFfn : GemmS4F16::kGemm,
+        //                      -1,
+        //                      stream_);
+        //     sync_check_cuda_error();
+        // }
+        // else {
             FT_CHECK_WITH_INFO(0, "Not implemented");
+        // }
     }

 private:
     cublasMMWrapper* cublas_wrapper_;
     cudaStream_t     stream_{};
-    GemmS4F16        gemm_s4_f16_;
+    // GemmS4F16        gemm_s4_f16_;
 };

 }  // namespace turbomind