Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Lmdeploy
Commits
e38ee081
Commit
e38ee081
authored
Nov 14, 2023
by
xiabo
Browse files
Adapt to rocm
parent
56942c43
Changes
41
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
543 additions
and
424 deletions
+543
-424
src/turbomind/models/llama/llama_decoder_kernels.cu
src/turbomind/models/llama/llama_decoder_kernels.cu
+51
-15
src/turbomind/models/llama/llama_kernels.cu
src/turbomind/models/llama/llama_kernels.cu
+2
-1
src/turbomind/python/bind.cpp
src/turbomind/python/bind.cpp
+16
-15
src/turbomind/triton_backend/CMakeLists.txt
src/turbomind/triton_backend/CMakeLists.txt
+30
-16
src/turbomind/triton_backend/llama/CMakeLists.txt
src/turbomind/triton_backend/llama/CMakeLists.txt
+5
-3
src/turbomind/triton_backend/llama/LlamaTritonModel.cc
src/turbomind/triton_backend/llama/LlamaTritonModel.cc
+1
-1
src/turbomind/utils/CMakeLists.txt
src/turbomind/utils/CMakeLists.txt
+49
-42
src/turbomind/utils/allocator.h
src/turbomind/utils/allocator.h
+30
-30
src/turbomind/utils/cublasAlgoMap.cc
src/turbomind/utils/cublasAlgoMap.cc
+2
-1
src/turbomind/utils/cublasMMWrapper.cc
src/turbomind/utils/cublasMMWrapper.cc
+22
-20
src/turbomind/utils/cuda_type_utils.cuh
src/turbomind/utils/cuda_type_utils.cuh
+24
-11
src/turbomind/utils/gemm.cc
src/turbomind/utils/gemm.cc
+6
-4
src/turbomind/utils/gemm_test/CMakeLists.txt
src/turbomind/utils/gemm_test/CMakeLists.txt
+45
-32
src/turbomind/utils/gemm_test/decoding_gemm_func.cc
src/turbomind/utils/gemm_test/decoding_gemm_func.cc
+43
-38
src/turbomind/utils/gemm_test/encoder_gemm_func.cc
src/turbomind/utils/gemm_test/encoder_gemm_func.cc
+35
-30
src/turbomind/utils/gemm_test/encoder_igemm_func.cc
src/turbomind/utils/gemm_test/encoder_igemm_func.cc
+16
-16
src/turbomind/utils/gemm_test/gpt_gemm_func.cc
src/turbomind/utils/gemm_test/gpt_gemm_func.cc
+53
-46
src/turbomind/utils/gemm_test/swin_gemm_func.cc
src/turbomind/utils/gemm_test/swin_gemm_func.cc
+35
-30
src/turbomind/utils/gemm_test/swin_igemm_func.cc
src/turbomind/utils/gemm_test/swin_igemm_func.cc
+17
-17
src/turbomind/utils/gemm_test/t5_gemm_func.cc
src/turbomind/utils/gemm_test/t5_gemm_func.cc
+61
-56
No files found.
src/turbomind/models/llama/llama_decoder_kernels.cu
View file @
e38ee081
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <cooperative_groups.h>
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
//
#include <cooperative_groups/reduce.h>
#include <cuda_fp16.h>
#include <cuda_fp16.h>
namespace
cg
=
cooperative_groups
;
namespace
cg
=
cooperative_groups
;
...
@@ -83,22 +83,57 @@ struct res_norm_ops_t<float> {
...
@@ -83,22 +83,57 @@ struct res_norm_ops_t<float> {
}
}
};
};
template
<
typename
T
>
//
template<typename T>
__device__
T
blockReduceSum
(
const
cg
::
thread_block
&
block
,
T
value
)
//
__device__ T blockReduceSum(const cg::thread_block& block, T value)
{
//
{
__shared__
float
partial
[
32
];
//
__shared__ float partial[32];
auto
tile
=
cg
::
tiled_partition
<
32
>
(
block
);
//
auto tile = cg::tiled_partition<32>(block);
value
=
cg
::
reduce
(
tile
,
value
,
cg
::
plus
<
float
>
{});
//
value = cg::reduce(tile, value, cg::plus<float>{});
if
(
tile
.
thread_rank
()
==
0
)
{
// if (tile.thread_rank() == 0) {
partial
[
tile
.
meta_group_rank
()]
=
value
;
// partial[tile.meta_group_rank()] = value;
}
// }
// block.sync();
block
.
sync
();
// value = tile.thread_rank() < tile.meta_group_size() ? partial[tile.thread_rank()] : T{};
// return cg::reduce(tile, value, cg::plus<float>{});
// }
#define WARPSIZE 64
template
<
typename
T
>
__inline__
__device__
T
warpReduceSum_xiabo
(
T
value
)
{
#pragma unroll
for
(
int
offset
=
WARPSIZE
/
2
;
offset
>
0
;
offset
>>=
1
)
value
+=
__shfl_down_sync
(
0xffffffff
,
value
,
offset
);
return
value
;
}
value
=
tile
.
thread_rank
()
<
tile
.
meta_group_size
()
?
partial
[
tile
.
thread_rank
()]
:
T
{};
template
<
typename
T
>
return
cg
::
reduce
(
tile
,
value
,
cg
::
plus
<
float
>
{});
__inline__
__device__
T
blockReduceSum_xiabo
(
T
val
)
{
T
sum
=
(
T
)(
0.0
f
);
__shared__
T
shared
[
WARPSIZE
];
sum
=
warpReduceSum_xiabo
(
val
);
__syncthreads
();
int
tid
=
threadIdx
.
x
+
threadIdx
.
y
*
blockDim
.
x
;
if
(
tid
%
WARPSIZE
==
0
)
{
shared
[
tid
/
WARPSIZE
]
=
sum
;
}
if
(
tid
>=
blockDim
.
x
*
blockDim
.
y
/
WARPSIZE
&&
tid
<
WARPSIZE
)
{
shared
[
tid
]
=
(
T
)(
0.0
f
);
}
__syncthreads
();
if
(
tid
/
WARPSIZE
==
0
)
{
sum
=
warpReduceSum_xiabo
(
shared
[
tid
]);
if
(
tid
==
0
)
{
shared
[
0
]
=
sum
;
}
}
__syncthreads
();
return
shared
[
0
];
}
}
template
<
typename
T
>
template
<
typename
T
>
...
@@ -111,7 +146,7 @@ __global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
...
@@ -111,7 +146,7 @@ __global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
int
n_dims
)
int
n_dims
)
{
{
auto
block
=
cg
::
this_thread_block
();
auto
block
=
cg
::
this_thread_block
();
auto
grid
=
cg
::
this_grid
();
//
auto grid = cg::this_grid();
constexpr
int
PACK_DIM
=
sizeof
(
uint4
)
/
sizeof
(
T
);
constexpr
int
PACK_DIM
=
sizeof
(
uint4
)
/
sizeof
(
T
);
...
@@ -131,7 +166,8 @@ __global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
...
@@ -131,7 +166,8 @@ __global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
r_ptr
[
i
]
=
r
;
r_ptr
[
i
]
=
r
;
}
}
auto
total_sum
=
blockReduceSum
(
block
,
thread_sum
);
// auto total_sum = blockReduceSum(block, thread_sum);
auto
total_sum
=
blockReduceSum_xiabo
(
thread_sum
);
float
s_inv_mean
=
rsqrt
(
total_sum
/
n_dims
+
eps
);
float
s_inv_mean
=
rsqrt
(
total_sum
/
n_dims
+
eps
);
...
...
src/turbomind/models/llama/llama_kernels.cu
View file @
e38ee081
...
@@ -315,7 +315,8 @@ static inline __device__ half4 char4_scale_to_half4(char4 value, const float sca
...
@@ -315,7 +315,8 @@ static inline __device__ half4 char4_scale_to_half4(char4 value, const float sca
static
inline
__device__
uint32_t
float4_to_char4
(
float
x
,
float
y
,
float
z
,
float
w
)
static
inline
__device__
uint32_t
float4_to_char4
(
float
x
,
float
y
,
float
z
,
float
w
)
{
{
uint32_t
dst
;
uint32_t
dst
;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 720
// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 720
#if 0
uint32_t a;
uint32_t a;
asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(a) : "f"(x));
asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(a) : "f"(x));
uint32_t b;
uint32_t b;
...
...
src/turbomind/python/bind.cpp
View file @
e38ee081
#include "src/turbomind/kernels/gemm_s_f16/format.h"
//
#include "src/turbomind/kernels/gemm_s_f16/format.h"
#include "src/turbomind/python/dlpack.h"
#include "src/turbomind/python/dlpack.h"
#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
...
@@ -47,7 +47,8 @@ DLDevice getDLDevice(triton::Tensor& tensor)
...
@@ -47,7 +47,8 @@ DLDevice getDLDevice(triton::Tensor& tensor)
case
triton
::
MEMORY_CPU_PINNED
:
case
triton
::
MEMORY_CPU_PINNED
:
device
.
device_type
=
DLDeviceType
::
kDLCUDAHost
;
device
.
device_type
=
DLDeviceType
::
kDLCUDAHost
;
case
triton
::
MEMORY_GPU
:
case
triton
::
MEMORY_GPU
:
device
.
device_type
=
DLDeviceType
::
kDLCUDA
;
// device.device_type = DLDeviceType::kDLCUDA;
device
.
device_type
=
DLDeviceType
::
kDLROCM
;
break
;
break
;
default:
default:
break
;
break
;
...
@@ -415,15 +416,15 @@ PYBIND11_MODULE(_turbomind, m)
...
@@ -415,15 +416,15 @@ PYBIND11_MODULE(_turbomind, m)
auto
src_tensor
=
GetDLTensor
(
src
);
auto
src_tensor
=
GetDLTensor
(
src
);
auto
dst_tensor
=
GetDLTensor
(
dst
);
auto
dst_tensor
=
GetDLTensor
(
dst
);
turbomind
::
transpose_qk_s4_k_m8_hf
(
//
turbomind::transpose_qk_s4_k_m8_hf(
(
uint32_t
*
)
dst_tensor
.
data
,
(
const
uint32_t
*
)
src_tensor
.
data
,
m
,
k
,
size_per_head
,
nullptr
);
//
(uint32_t*)dst_tensor.data, (const uint32_t*)src_tensor.data, m, k, size_per_head, nullptr);
});
});
m
.
def
(
"fuse_w1_w3_s4_k_m8"
,
[](
py
::
object
src
,
py
::
object
dst
,
int
m
,
int
k
)
{
m
.
def
(
"fuse_w1_w3_s4_k_m8"
,
[](
py
::
object
src
,
py
::
object
dst
,
int
m
,
int
k
)
{
auto
src_tensor
=
GetDLTensor
(
src
);
auto
src_tensor
=
GetDLTensor
(
src
);
auto
dst_tensor
=
GetDLTensor
(
dst
);
auto
dst_tensor
=
GetDLTensor
(
dst
);
turbomind
::
fuse_w1_w3_s4_k_m8
((
uint32_t
*
)
dst_tensor
.
data
,
(
const
uint32_t
*
)
src_tensor
.
data
,
m
,
k
,
nullptr
);
//
turbomind::fuse_w1_w3_s4_k_m8((uint32_t*)dst_tensor.data, (const uint32_t*)src_tensor.data, m, k, nullptr);
});
});
m
.
def
(
"convert_s4_k_m8"
,
m
.
def
(
"convert_s4_k_m8"
,
...
@@ -443,16 +444,16 @@ PYBIND11_MODULE(_turbomind, m)
...
@@ -443,16 +444,16 @@ PYBIND11_MODULE(_turbomind, m)
auto
s
=
GetDLTensor
(
scales
);
auto
s
=
GetDLTensor
(
scales
);
auto
qz
=
GetDLTensor
(
qzeros
);
auto
qz
=
GetDLTensor
(
qzeros
);
turbomind
::
convert_s4_k_m8
((
uint32_t
*
)
a_dst
.
data
,
//
turbomind::convert_s4_k_m8((uint32_t*)a_dst.data,
(
half2
*
)
q_dst
.
data
,
//
(half2*)q_dst.data,
(
half
*
)
w
.
data
,
//
(half*)w.data,
(
const
uint32_t
*
)
a_src
.
data
,
//
(const uint32_t*)a_src.data,
(
const
half
*
)
s
.
data
,
//
(const half*)s.data,
(
const
uint32_t
*
)
qz
.
data
,
//
(const uint32_t*)qz.data,
m
,
//
m,
k
,
//
k,
group_size
,
//
group_size,
nullptr
);
//
nullptr);
});
});
m
.
def
(
"dequantize_s4"
,
[](
py
::
object
src
,
py
::
object
dst
)
{
m
.
def
(
"dequantize_s4"
,
[](
py
::
object
src
,
py
::
object
dst
)
{
...
...
src/turbomind/triton_backend/CMakeLists.txt
View file @
e38ee081
...
@@ -24,13 +24,17 @@
...
@@ -24,13 +24,17 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required
(
VERSION 3.18
)
#cmake_minimum_required (VERSION 3.18)
cmake_minimum_required
(
VERSION 3.16
)
project
(
tritonturbomindbackend LANGUAGES C CXX
)
project
(
tritonturbomindbackend LANGUAGES C CXX
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-fPIC"
)
set
(
CMAKE_CUDA_FLAGS
"
${
CMAKE_CUDA_FLAGS
}
-fPIC"
)
add_library
(
TransformerTritonBackend STATIC transformer_triton_backend.cpp
)
add_library
(
TransformerTritonBackend STATIC transformer_triton_backend.cpp
)
target_link_libraries
(
TransformerTritonBackend PUBLIC nccl_utils
)
target_link_libraries
(
TransformerTritonBackend PUBLIC nccl_utils
)
set_property
(
TARGET TransformerTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET TransformerTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON)
install
(
TARGETS TransformerTritonBackend DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
)
install
(
TARGETS TransformerTritonBackend DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
)
add_subdirectory
(
llama
)
add_subdirectory
(
llama
)
...
@@ -70,21 +74,24 @@ include(FetchContent)
...
@@ -70,21 +74,24 @@ include(FetchContent)
FetchContent_Declare
(
FetchContent_Declare
(
repo-common
repo-common
GIT_REPOSITORY https://github.com/triton-inference-server/common.git
URL ../../../3rdparty/common-r22.12
GIT_TAG
${
TRITON_COMMON_REPO_TAG
}
#GIT_REPOSITORY https://github.com/triton-inference-server/common.git
GIT_SHALLOW ON
#GIT_TAG ${TRITON_COMMON_REPO_TAG}
#GIT_SHALLOW ON
)
)
FetchContent_Declare
(
FetchContent_Declare
(
repo-core
repo-core
GIT_REPOSITORY https://github.com/triton-inference-server/core.git
URL ../../../3rdparty/core-r22.12
GIT_TAG
${
TRITON_CORE_REPO_TAG
}
#GIT_REPOSITORY https://github.com/triton-inference-server/core.git
GIT_SHALLOW ON
#GIT_TAG ${TRITON_CORE_REPO_TAG}
#GIT_SHALLOW ON
)
)
FetchContent_Declare
(
FetchContent_Declare
(
repo-backend
repo-backend
GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
URL ../../../3rdparty/backend-r22.12
GIT_TAG
${
TRITON_BACKEND_REPO_TAG
}
#GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
GIT_SHALLOW ON
#GIT_TAG ${TRITON_BACKEND_REPO_TAG}
#GIT_SHALLOW ON
)
)
FetchContent_MakeAvailable
(
repo-common repo-core repo-backend
)
FetchContent_MakeAvailable
(
repo-common repo-core repo-backend
)
...
@@ -92,7 +99,8 @@ FetchContent_MakeAvailable(repo-common repo-core repo-backend)
...
@@ -92,7 +99,8 @@ FetchContent_MakeAvailable(repo-common repo-core repo-backend)
# CUDA
# CUDA
#
#
if
(
${
TRITON_ENABLE_GPU
}
)
if
(
${
TRITON_ENABLE_GPU
}
)
find_package
(
CUDAToolkit REQUIRED
)
#find_package(CUDAToolkit REQUIRED)
find_package
(
CUDA REQUIRED
)
endif
()
# TRITON_ENABLE_GPU
endif
()
# TRITON_ENABLE_GPU
#
#
...
@@ -109,7 +117,8 @@ add_library(
...
@@ -109,7 +117,8 @@ add_library(
TritonTurboMindBackend::triton-turbomind-backend ALIAS triton-turbomind-backend
TritonTurboMindBackend::triton-turbomind-backend ALIAS triton-turbomind-backend
)
)
find_package
(
CUDAToolkit REQUIRED
)
#find_package(CUDAToolkit REQUIRED)
find_package
(
CUDA REQUIRED
)
find_package
(
CUDA 10.1 REQUIRED
)
find_package
(
CUDA 10.1 REQUIRED
)
if
(
${
CUDA_VERSION
}
GREATER_EQUAL 11.0
)
if
(
${
CUDA_VERSION
}
GREATER_EQUAL 11.0
)
message
(
STATUS
"Add DCUDA11_MODE"
)
message
(
STATUS
"Add DCUDA11_MODE"
)
...
@@ -158,10 +167,14 @@ if(${TRITON_ENABLE_GPU})
...
@@ -158,10 +167,14 @@ if(${TRITON_ENABLE_GPU})
)
)
endif
()
# TRITON_ENABLE_GPU
endif
()
# TRITON_ENABLE_GPU
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-fPIC"
)
set
(
CMAKE_CUDA_FLAGS
"
${
CMAKE_CUDA_FLAGS
}
-fPIC"
)
set_target_properties
(
set_target_properties
(
triton-turbomind-backend
triton-turbomind-backend
PROPERTIES
PROPERTIES
POSITION_INDEPENDENT_CODE ON
# POSITION_INDEPENDENT_CODE ON
POSITION_INDEPENDENT_CODE OFF
OUTPUT_NAME triton_turbomind
OUTPUT_NAME triton_turbomind
SKIP_BUILD_RPATH TRUE
SKIP_BUILD_RPATH TRUE
BUILD_WITH_INSTALL_RPATH TRUE
BUILD_WITH_INSTALL_RPATH TRUE
...
@@ -194,7 +207,7 @@ target_link_libraries(
...
@@ -194,7 +207,7 @@ target_link_libraries(
transformer-shared
# from repo-ft
transformer-shared
# from repo-ft
${
TRITON_PYTORCH_LDFLAGS
}
${
TRITON_PYTORCH_LDFLAGS
}
-lcublas
-lcublas
-lcublasLt
#
-lcublasLt
-lcudart
-lcudart
-lcurand
-lcurand
)
)
...
@@ -228,7 +241,8 @@ if(${TRITON_ENABLE_GPU})
...
@@ -228,7 +241,8 @@ if(${TRITON_ENABLE_GPU})
target_link_libraries
(
target_link_libraries
(
triton-turbomind-backend
triton-turbomind-backend
PRIVATE
PRIVATE
CUDA::cudart
# CUDA::cudart
cudart
)
)
endif
()
# TRITON_ENABLE_GPU
endif
()
# TRITON_ENABLE_GPU
...
...
src/turbomind/triton_backend/llama/CMakeLists.txt
View file @
e38ee081
...
@@ -22,8 +22,10 @@ set(llama_triton_backend_files
...
@@ -22,8 +22,10 @@ set(llama_triton_backend_files
LlamaTritonModelInstance.cc
LlamaTritonModelInstance.cc
)
)
find_package
(
CUDAToolkit REQUIRED
)
#find_package(CUDAToolkit REQUIRED)
find_package
(
CUDA REQUIRED
)
add_library
(
LlamaTritonBackend STATIC
${
llama_triton_backend_files
}
)
add_library
(
LlamaTritonBackend STATIC
${
llama_triton_backend_files
}
)
set_property
(
TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON
)
#set_property(TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries
(
LlamaTritonBackend PUBLIC TransformerTritonBackend Llama tensor memory_utils CUDA::cublasLt
)
#target_link_libraries(LlamaTritonBackend PUBLIC TransformerTritonBackend Llama tensor memory_utils CUDA::cublasLt)
target_link_libraries
(
LlamaTritonBackend PUBLIC TransformerTritonBackend Llama tensor memory_utils
)
target_compile_features
(
LlamaTritonBackend PRIVATE cxx_std_14
)
target_compile_features
(
LlamaTritonBackend PRIVATE cxx_std_14
)
src/turbomind/triton_backend/llama/LlamaTritonModel.cc
View file @
e38ee081
...
@@ -197,7 +197,7 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
...
@@ -197,7 +197,7 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
cublasLtHandle_t
cublaslt_handle
;
cublasLtHandle_t
cublaslt_handle
;
cublasCreate
(
&
cublas_handle
);
cublasCreate
(
&
cublas_handle
);
cublasLtCreate
(
&
cublaslt_handle
);
//
cublasLtCreate(&cublaslt_handle);
cublasSetStream
(
cublas_handle
,
stream
);
cublasSetStream
(
cublas_handle
,
stream
);
std
::
unique_ptr
<
ft
::
cublasAlgoMap
>
cublas_algo_map
(
new
ft
::
cublasAlgoMap
(
"gemm_config.in"
));
std
::
unique_ptr
<
ft
::
cublasAlgoMap
>
cublas_algo_map
(
new
ft
::
cublasAlgoMap
(
"gemm_config.in"
));
...
...
src/turbomind/utils/CMakeLists.txt
View file @
e38ee081
...
@@ -14,98 +14,105 @@
...
@@ -14,98 +14,105 @@
cmake_minimum_required
(
VERSION 3.8
)
cmake_minimum_required
(
VERSION 3.8
)
find_package
(
CUDAToolkit REQUIRED
)
#find_package(CUDAToolkit REQUIRED)
find_package
(
CUDA REQUIRED
)
add_subdirectory
(
gemm_test
)
add_subdirectory
(
gemm_test
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-fPIC"
)
set
(
CMAKE_CUDA_FLAGS
"
${
CMAKE_CUDA_FLAGS
}
-fPIC"
)
add_library
(
cuda_utils STATIC cuda_utils.cc
)
add_library
(
cuda_utils STATIC cuda_utils.cc
)
set_property
(
TARGET cuda_utils PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET cuda_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET cuda_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET cuda_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
cuda_utils PUBLIC
CUDA::
cudart
)
target_link_libraries
(
cuda_utils PUBLIC cudart
)
add_library
(
logger STATIC logger.cc
)
add_library
(
logger STATIC logger.cc
)
set_property
(
TARGET logger PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET logger PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET logger PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET logger PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
logger PUBLIC
CUDA::
cudart
)
target_link_libraries
(
logger PUBLIC cudart
)
add_library
(
cublasAlgoMap STATIC cublasAlgoMap.cc
)
add_library
(
cublasAlgoMap STATIC cublasAlgoMap.cc
)
set_property
(
TARGET cublasAlgoMap PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET cublasAlgoMap PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET cublasAlgoMap PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET cublasAlgoMap PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
cublasAlgoMap PUBLIC
CUDA::
cublas
CUDA::
cudart
CUDA::
curand cuda_utils logger
)
target_link_libraries
(
cublasAlgoMap PUBLIC cublas cudart curand cuda_utils logger
)
add_library
(
cublasMMWrapper STATIC cublasMMWrapper.cc
)
add_library
(
cublasMMWrapper STATIC cublasMMWrapper.cc
)
set_property
(
TARGET cublasMMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET cublasMMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET cublasMMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET cublasMMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
cublasMMWrapper PUBLIC
CUDA::
cublas
CUDA::
cudart
CUDA::
curand cublasAlgoMap cuda_utils logger
)
target_link_libraries
(
cublasMMWrapper PUBLIC cublas cudart curand cublasAlgoMap cuda_utils logger
)
if
(
SPARSITY_SUPPORT
)
if
(
SPARSITY_SUPPORT
)
target_link_libraries
(
cublasMMWrapper PUBLIC
CUDA::
cusparse -lcusparseLt
)
target_link_libraries
(
cublasMMWrapper PUBLIC cusparse -lcusparseLt
)
endif
()
endif
()
add_library
(
word_list STATIC word_list.cc
)
add_library
(
word_list STATIC word_list.cc
)
set_property
(
TARGET word_list PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET word_list PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET word_list PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET word_list PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
nvtx_utils STATIC nvtx_utils.cc
)
add_library
(
nvtx_utils STATIC nvtx_utils.cc
)
set_property
(
TARGET nvtx_utils PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET nvtx_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET nvtx_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET nvtx_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
if
(
${
CMAKE_VERSION
}
VERSION_LESS
"3.25"
)
if
(
${
CMAKE_VERSION
}
VERSION_LESS
"3.25"
)
target_link_libraries
(
nvtx_utils PUBLIC
CUDA::
nvToolsExt -ldl
)
#
target_link_libraries(nvtx_utils PUBLIC nvToolsExt -ldl)
else
()
else
()
target_link_libraries
(
nvtx_utils PUBLIC
CUDA::
nvtx3 -ldl
)
#
target_link_libraries(nvtx_utils PUBLIC nvtx3 -ldl)
endif
()
endif
()
add_library
(
memory_utils STATIC memory_utils.cu
)
add_library
(
memory_utils STATIC memory_utils.cu
)
set_property
(
TARGET memory_utils PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET memory_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET memory_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET memory_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
memory_utils PUBLIC cuda_utils logger tensor
)
target_link_libraries
(
memory_utils PUBLIC cuda_utils logger tensor
)
add_library
(
mpi_utils STATIC mpi_utils.cc
)
add_library
(
mpi_utils STATIC mpi_utils.cc
)
set_property
(
TARGET mpi_utils PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET mpi_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET mpi_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET mpi_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
if
(
BUILD_MULTI_GPU
)
if
(
BUILD_MULTI_GPU
)
target_link_libraries
(
mpi_utils PUBLIC mpi logger
)
target_link_libraries
(
mpi_utils PUBLIC mpi logger
)
endif
()
endif
()
add_library
(
nccl_utils STATIC nccl_utils.cc
)
add_library
(
nccl_utils STATIC nccl_utils.cc
)
set_property
(
TARGET nccl_utils PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET nccl_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET nccl_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET nccl_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
if
(
BUILD_MULTI_GPU
)
if
(
BUILD_MULTI_GPU
)
target_link_libraries
(
nccl_utils PUBLIC
${
NCCL_LIBRARIES
}
logger
)
target_link_libraries
(
nccl_utils PUBLIC
${
NCCL_LIBRARIES
}
logger
)
endif
()
endif
()
add_library
(
cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc
)
add_library
(
cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc
)
set_property
(
TARGET cublasINT8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON
)
#set_property(TARGET cublasINT8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET cublasINT8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#set_property(TARGET cublasINT8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
cublasINT8MMWrapper PUBLIC CUDA::cublasLt CUDA::cudart CUDA::curand cublasAlgoMap cublasMMWrapper cuda_utils logger
)
#target_link_libraries(cublasINT8MMWrapper PUBLIC cublasLt cudart curand cublasAlgoMap cublasMMWrapper cuda_utils logger)
target_link_libraries
(
cublasINT8MMWrapper PUBLIC cudart curand cublasAlgoMap cublasMMWrapper cuda_utils logger
)
if
(
ENABLE_FP8
)
if
(
ENABLE_FP8
)
add_library
(
cublasFP8MMWrapper STATIC cublasFP8MMWrapper.cu
)
add_library
(
cublasFP8MMWrapper STATIC cublasFP8MMWrapper.cu
)
set_property
(
TARGET cublasFP8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON
)
#set_property(TARGET cublasFP8MMWrapper PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET cublasFP8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#set_property(TARGET cublasFP8MMWrapper PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
cublasFP8MMWrapper PUBLIC CUDA::cublasLt CUDA::cudart CUDA::curand
#target_link_libraries(cublasFP8MMWrapper PUBLIC cublasLt cudart curand
target_link_libraries
(
cublasFP8MMWrapper PUBLIC cudart curand
cublasAlgoMap cublasMMWrapper nvtx_utils fp8_qgmma_1x1_utils
)
cublasAlgoMap cublasMMWrapper nvtx_utils fp8_qgmma_1x1_utils
)
endif
()
endif
()
add_library
(
custom_ar_comm STATIC custom_ar_comm.cc
)
add_library
(
custom_ar_comm STATIC custom_ar_comm.cc
)
set_property
(
TARGET custom_ar_comm PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET custom_ar_comm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET custom_ar_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET custom_ar_comm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
custom_ar_comm PUBLIC custom_ar_kernels memory_utils cuda_utils logger
)
target_link_libraries
(
custom_ar_comm PUBLIC custom_ar_kernels memory_utils cuda_utils logger
)
add_library
(
gemm STATIC gemm.cc
)
add_library
(
gemm STATIC gemm.cc
)
set_property
(
TARGET gemm PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET gemm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET gemm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET gemm PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
gemm PUBLIC
target_link_libraries
(
gemm PUBLIC
CUDA::cublas CUDA::cublasLt CUDA::cudart CUDA::curand
# cublas cublasLt cudart curand
cublas cudart curand
cublasAlgoMap memory_utils cuda_utils logger
)
cublasAlgoMap memory_utils cuda_utils logger
)
if
(
SPARSITY_SUPPORT
)
if
(
SPARSITY_SUPPORT
)
target_link_libraries
(
gemm PUBLIC
CUDA::
cusparse -lcusparseLt
)
target_link_libraries
(
gemm PUBLIC cusparse -lcusparseLt
)
endif
()
endif
()
add_library
(
cuda_fp8_utils STATIC cuda_fp8_utils.cu
)
add_library
(
cuda_fp8_utils STATIC cuda_fp8_utils.cu
)
set_property
(
TARGET cuda_fp8_utils PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET cuda_fp8_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET cuda_fp8_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET cuda_fp8_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
tensor STATIC Tensor.cc
)
add_library
(
tensor STATIC Tensor.cc
)
set_property
(
TARGET tensor PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET tensor PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET tensor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET tensor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
tensor PUBLIC cuda_utils logger
)
target_link_libraries
(
tensor PUBLIC cuda_utils logger
)
src/turbomind/utils/allocator.h
View file @
e38ee081
...
@@ -158,36 +158,36 @@ public:
...
@@ -158,36 +158,36 @@ public:
{
{
TM_LOG_DEBUG
(
__PRETTY_FUNCTION__
);
TM_LOG_DEBUG
(
__PRETTY_FUNCTION__
);
pointer_mapping_
=
new
std
::
unordered_map
<
void
*
,
std
::
pair
<
size_t
,
MemoryType
>>
();
pointer_mapping_
=
new
std
::
unordered_map
<
void
*
,
std
::
pair
<
size_t
,
MemoryType
>>
();
#if defined(CUDA_MEMORY_POOL_DISABLED)
//
#if defined(CUDA_MEMORY_POOL_DISABLED)
TM_LOG_WARNING
(
//
TM_LOG_WARNING(
"Async cudaMalloc/Free is not supported before CUDA 11.2. Using Sync cudaMalloc/Free."
//
"Async cudaMalloc/Free is not supported before CUDA 11.2. Using Sync cudaMalloc/Free."
"Note this may lead to hang with NCCL kernels launched in parallel; if so, try NCCL_LAUNCH_MODE=GROUP"
);
//
"Note this may lead to hang with NCCL kernels launched in parallel; if so, try NCCL_LAUNCH_MODE=GROUP");
#else
//
#else
int
device_count
=
1
;
//
int device_count = 1;
check_cuda_error
(
cudaGetDeviceCount
(
&
device_count
));
//
check_cuda_error(cudaGetDeviceCount(&device_count));
cudaMemPool_t
mempool
;
//
cudaMemPool_t mempool;
check_cuda_error
(
cudaDeviceGetDefaultMemPool
(
&
mempool
,
device_id
));
//
check_cuda_error(cudaDeviceGetDefaultMemPool(&mempool, device_id));
cudaMemAccessDesc
desc
=
{};
//
cudaMemAccessDesc desc = {};
int
peer_access_available
=
0
;
//
int peer_access_available = 0;
for
(
int
i
=
0
;
i
<
device_count
;
i
++
)
{
//
for (int i = 0; i < device_count; i++) {
if
(
i
==
device_id
)
{
//
if (i == device_id) {
continue
;
//
continue;
}
//
}
check_cuda_error
(
cudaDeviceCanAccessPeer
(
&
peer_access_available
,
device_id
,
i
));
//
check_cuda_error(cudaDeviceCanAccessPeer(&peer_access_available, device_id, i));
if
(
!
peer_access_available
)
{
//
if (!peer_access_available) {
TM_LOG_WARNING
(
"Device "
+
std
::
to_string
(
device_id
)
+
" peer access Device "
+
std
::
to_string
(
i
)
//
TM_LOG_WARNING("Device " + std::to_string(device_id) + " peer access Device " + std::to_string(i)
+
" is not available."
);
//
+ " is not available.");
continue
;
//
continue;
}
//
}
desc
.
location
.
type
=
cudaMemLocationTypeDevice
;
//
desc.location.type = cudaMemLocationTypeDevice;
desc
.
location
.
id
=
i
;
//
desc.location.id = i;
desc
.
flags
=
cudaMemAccessFlagsProtReadWrite
;
//
desc.flags = cudaMemAccessFlagsProtReadWrite;
check_cuda_error
(
cudaMemPoolSetAccess
(
mempool
,
&
desc
,
1
));
//
check_cuda_error(cudaMemPoolSetAccess(mempool, &desc, 1));
}
//
}
// set memory pool threshold to avoid shrinking the pool
//
// set memory pool threshold to avoid shrinking the pool
uint64_t
setVal
=
UINT64_MAX
;
//
uint64_t setVal = UINT64_MAX;
check_cuda_error
(
cudaMemPoolSetAttribute
(
mempool
,
cudaMemPoolAttrReleaseThreshold
,
&
setVal
));
//
check_cuda_error(cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &setVal));
#endif
//
#endif
}
}
virtual
~
Allocator
()
virtual
~
Allocator
()
...
...
src/turbomind/utils/cublasAlgoMap.cc
View file @
e38ee081
...
@@ -139,7 +139,8 @@ cublasAlgoMap::getAlgo(const int batch_count, const int m, const int n, const in
...
@@ -139,7 +139,8 @@ cublasAlgoMap::getAlgo(const int batch_count, const int m, const int n, const in
else
{
else
{
cublasLtMatmulAlgo_info
tmp_algo
;
cublasLtMatmulAlgo_info
tmp_algo
;
tmp_algo
.
algoId
=
tmp_algo
.
algoId
=
static_cast
<
int
>
(
data_type
==
FLOAT_DATATYPE
?
CUBLAS_GEMM_DEFAULT
:
CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
// static_cast<int>(data_type == FLOAT_DATATYPE ? CUBLAS_GEMM_DEFAULT : CUBLAS_GEMM_DEFAULT_TENSOR_OP);
static_cast
<
int
>
(
data_type
==
FLOAT_DATATYPE
?
CUBLAS_GEMM_DEFAULT
:
CUBLAS_GEMM_DEFAULT
);
tmp_algo
.
customOption
=
-
1
;
tmp_algo
.
customOption
=
-
1
;
tmp_algo
.
tile
=
-
1
;
tmp_algo
.
tile
=
-
1
;
tmp_algo
.
splitK_val
=
-
1
;
tmp_algo
.
splitK_val
=
-
1
;
...
...
src/turbomind/utils/cublasMMWrapper.cc
View file @
e38ee081
...
@@ -192,7 +192,8 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
...
@@ -192,7 +192,8 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
}
}
}
}
if
(
using_cublasLt
)
{
// if (using_cublasLt) {
if
(
0
)
{
cublasLtMatmulDesc_t
operationDesc
=
NULL
;
cublasLtMatmulDesc_t
operationDesc
=
NULL
;
cublasLtMatrixLayout_t
Adesc
=
NULL
,
Bdesc
=
NULL
,
Cdesc
=
NULL
;
cublasLtMatrixLayout_t
Adesc
=
NULL
,
Bdesc
=
NULL
,
Cdesc
=
NULL
;
cudaDataType_t
scaleType
;
cudaDataType_t
scaleType
;
...
@@ -279,22 +280,22 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
...
@@ -279,22 +280,22 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
}
}
}
}
cublasLtMatmul
(
cublaslt_handle_
,
//
cublasLtMatmul(cublaslt_handle_,
operationDesc
,
//
operationDesc,
alpha
,
//
alpha,
A
,
//
A,
Adesc
,
//
Adesc,
B
,
//
B,
Bdesc
,
//
Bdesc,
beta
,
//
beta,
C
,
//
C,
Cdesc
,
//
Cdesc,
C
,
//
C,
Cdesc
,
//
Cdesc,
(
findAlgo
==
1
?
(
&
algo
)
:
NULL
),
//
(findAlgo == 1 ? (&algo) : NULL),
workSpace
,
//
workSpace,
workspaceSize
,
//
workspaceSize,
stream_
);
//
stream_);
cublasLtMatmulDescDestroy
(
operationDesc
);
cublasLtMatmulDescDestroy
(
operationDesc
);
cublasLtMatrixLayoutDestroy
(
Adesc
);
cublasLtMatrixLayoutDestroy
(
Adesc
);
...
@@ -448,8 +449,8 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
...
@@ -448,8 +449,8 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
cublasLtMatmulDescSetAttribute
(
operationDesc
,
CUBLASLT_MATMUL_DESC_TRANSB
,
&
transb
,
sizeof
(
cublasOperation_t
));
cublasLtMatmulDescSetAttribute
(
operationDesc
,
CUBLASLT_MATMUL_DESC_TRANSB
,
&
transb
,
sizeof
(
cublasOperation_t
));
cublasLtMatmulDescSetAttribute
(
operationDesc
,
CUBLASLT_MATMUL_DESC_EPILOGUE
,
&
epi
,
sizeof
(
cublasLtEpilogue_t
));
cublasLtMatmulDescSetAttribute
(
operationDesc
,
CUBLASLT_MATMUL_DESC_EPILOGUE
,
&
epi
,
sizeof
(
cublasLtEpilogue_t
));
cublasLtMatmulDescSetAttribute
(
operationDesc
,
CUBLASLT_MATMUL_DESC_BIAS_POINTER
,
&
bias
,
sizeof
(
const
void
*
));
cublasLtMatmulDescSetAttribute
(
operationDesc
,
CUBLASLT_MATMUL_DESC_BIAS_POINTER
,
&
bias
,
sizeof
(
const
void
*
));
check_cuda_error
(
cublasLtMatmul
(
//
check_cuda_error(cublasLtMatmul(
cublaslt_handle_
,
operationDesc
,
alpha
,
A
,
Adesc
,
B
,
Bdesc
,
beta
,
C
,
Cdesc
,
C
,
Cdesc
,
NULL
,
NULL
,
0
,
stream_
));
//
cublaslt_handle_, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, Cdesc, NULL, NULL, 0, stream_));
cublasLtMatrixLayoutDestroy
(
Adesc
);
cublasLtMatrixLayoutDestroy
(
Adesc
);
cublasLtMatrixLayoutDestroy
(
Bdesc
);
cublasLtMatrixLayoutDestroy
(
Bdesc
);
cublasLtMatrixLayoutDestroy
(
Cdesc
);
cublasLtMatrixLayoutDestroy
(
Cdesc
);
...
@@ -985,7 +986,8 @@ void cublasMMWrapper::_Int8Gemm(const int m,
...
@@ -985,7 +986,8 @@ void cublasMMWrapper::_Int8Gemm(const int m,
* - 0: int8 * int8 -> int32 -> int8
* - 0: int8 * int8 -> int32 -> int8
* - 1: int8 * int8 -> int32 -> int32
* - 1: int8 * int8 -> int32 -> int32
*/
*/
#if (CUBLAS_VERSION) <= 11601
// #if (CUBLAS_VERSION) <= 11601
#if 1
FT_CHECK_WITH_INFO
(
false
,
"CUBLAS version too low."
);
FT_CHECK_WITH_INFO
(
false
,
"CUBLAS version too low."
);
#else
#else
...
...
src/turbomind/utils/cuda_type_utils.cuh
View file @
e38ee081
...
@@ -322,7 +322,7 @@ __device__ inline int8_t cuda_cast<int8_t, half>(half val)
...
@@ -322,7 +322,7 @@ __device__ inline int8_t cuda_cast<int8_t, half>(half val)
int16_t
int16_in
;
int16_t
int16_in
;
};
};
fp16
=
val
;
fp16
=
val
;
asm
volatile
(
"cvt.rni.sat.s8.f16 %0, %1;"
:
"=h"
(
int16
)
:
"h"
(
int16_in
));
//
asm volatile("cvt.rni.sat.s8.f16 %0, %1;" : "=h"(int16) : "h"(int16_in));
return
int8
[
0
];
return
int8
[
0
];
}
}
...
@@ -333,20 +333,31 @@ __device__ inline int16_t cuda_cast<int16_t, half2>(half2 val)
...
@@ -333,20 +333,31 @@ __device__ inline int16_t cuda_cast<int16_t, half2>(half2 val)
int8_t
int8
[
2
];
int8_t
int8
[
2
];
int16_t
int16
;
int16_t
int16
;
};
};
int8
[
0
]
=
cuda_cast
<
int8_t
>
(
val
.
x
);
// int8[0] = cuda_cast<int8_t>(val.x);
int8
[
1
]
=
cuda_cast
<
int8_t
>
(
val
.
y
);
// int8[1] = cuda_cast<int8_t>(val.y);
int8
[
0
]
=
cuda_cast
<
int8_t
>
((
val
.
data
[
0
]));
int8
[
1
]
=
cuda_cast
<
int8_t
>
((
val
.
data
[
1
]));
return
int16
;
return
int16
;
}
}
template
<
>
template
<
>
__device__
inline
int8_t
cuda_cast
<
int8_t
,
float
>
(
float
val
)
__device__
inline
int8_t
cuda_cast
<
int8_t
,
float
>
(
float
val
)
{
{
union
{
// union {
int8_t
int8
[
2
];
// int8_t int8[2];
int16_t
int16
;
// int16_t int16;
};
// };
asm
volatile
(
"cvt.rni.sat.s8.f32 %0, %1;"
:
"=h"
(
int16
)
:
"f"
(
val
));
// asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val));
return
int8
[
0
];
// return int8[0];
int8_t
dst
;
if
(
val
>=
128
){
dst
=
127
;
}
else
if
(
val
<
-
128
){
dst
=
-
128
;
}
else
{
dst
=
static_cast
<
int8_t
>
(
val
);
}
return
dst
;
}
}
template
<
>
template
<
>
...
@@ -528,13 +539,15 @@ __device__ inline To cuda_max(Ti val)
...
@@ -528,13 +539,15 @@ __device__ inline To cuda_max(Ti val)
template
<
>
template
<
>
__device__
inline
half
cuda_max
(
half2
val
)
__device__
inline
half
cuda_max
(
half2
val
)
{
{
return
(
val
.
x
>
val
.
y
)
?
val
.
x
:
val
.
y
;
// return (val.x > val.y) ? val.x : val.y;
return
(
val
.
data
[
0
]
>
val
.
data
[
1
])
?
val
.
data
[
0
]
:
val
.
data
[
1
];
}
}
#ifdef ENABLE_BF16
#ifdef ENABLE_BF16
template
<
>
template
<
>
__device__
inline
__nv_bfloat16
cuda_max
(
__nv_bfloat162
val
)
__device__
inline
__nv_bfloat16
cuda_max
(
__nv_bfloat162
val
)
{
{
return
(
val
.
x
>
val
.
y
)
?
val
.
x
:
val
.
y
;
// return (val.x > val.y) ? val.x : val.y;
return
(
val
.
data
[
0
]
>
val
.
data
[
1
])
?
val
.
data
[
0
]
:
val
.
data
[
1
];
}
}
#endif
#endif
...
...
src/turbomind/utils/gemm.cc
View file @
e38ee081
...
@@ -26,7 +26,7 @@ Gemm::Gemm(IAllocator* allocator, cudaStream_t stream, std::string config_file)
...
@@ -26,7 +26,7 @@ Gemm::Gemm(IAllocator* allocator, cudaStream_t stream, std::string config_file)
stream_
=
stream
;
stream_
=
stream
;
mutex_
=
new
std
::
mutex
();
// mutex per process
mutex_
=
new
std
::
mutex
();
// mutex per process
check_cuda_error
(
cublasCreate
(
&
cublas_handle_
));
check_cuda_error
(
cublasCreate
(
&
cublas_handle_
));
check_cuda_error
(
cublasLtCreate
(
&
cublaslt_handle_
));
//
check_cuda_error(cublasLtCreate(&cublaslt_handle_));
check_cuda_error
(
cublasSetStream
(
cublas_handle_
,
stream
));
check_cuda_error
(
cublasSetStream
(
cublas_handle_
,
stream
));
if
(
allocator_
!=
nullptr
)
{
if
(
allocator_
!=
nullptr
)
{
...
@@ -41,7 +41,7 @@ Gemm::~Gemm()
...
@@ -41,7 +41,7 @@ Gemm::~Gemm()
allocator_
->
free
((
void
**
)(
&
workspace_
));
allocator_
->
free
((
void
**
)(
&
workspace_
));
allocator_
=
nullptr
;
allocator_
=
nullptr
;
}
}
cublasLtDestroy
(
cublaslt_handle_
);
//
cublasLtDestroy(cublaslt_handle_);
cublasDestroy
(
cublas_handle_
);
cublasDestroy
(
cublas_handle_
);
delete
cublas_algo_map_
;
delete
cublas_algo_map_
;
delete
mutex_
;
delete
mutex_
;
...
@@ -248,7 +248,8 @@ void Gemm::gemm(const GemmOp transa,
...
@@ -248,7 +248,8 @@ void Gemm::gemm(const GemmOp transa,
mutex_
->
lock
();
mutex_
->
lock
();
// Use cublas as default in FP32 and cublasLt as default in FP16
// Use cublas as default in FP32 and cublasLt as default in FP16
bool
is_fp16_compute_type
=
compute_type_
==
TYPE_FP16
;
bool
is_fp16_compute_type
=
compute_type_
==
TYPE_FP16
;
bool
using_cublasLt
=
Atype
==
TYPE_FP16
;
// bool using_cublasLt = Atype == TYPE_FP16;
bool
using_cublasLt
=
(
Atype
==
TYPE_FP16
)
?
false
:
false
;
int
batch_count
=
1
;
int
batch_count
=
1
;
half
h_alpha
=
(
half
)
alpha
;
half
h_alpha
=
(
half
)
alpha
;
...
@@ -267,7 +268,8 @@ void Gemm::gemm(const GemmOp transa,
...
@@ -267,7 +268,8 @@ void Gemm::gemm(const GemmOp transa,
using_cublasLt
=
(
info
.
stages
!=
-
1
);
using_cublasLt
=
(
info
.
stages
!=
-
1
);
}
}
if
(
using_cublasLt
)
{
// if (using_cublasLt) {
if
(
0
)
{
const
size_t
a_rows
=
(
a_op
==
getCublasOperation
(
GEMM_OP_N
))
?
_m
:
k
;
const
size_t
a_rows
=
(
a_op
==
getCublasOperation
(
GEMM_OP_N
))
?
_m
:
k
;
const
size_t
a_cols
=
(
a_op
==
getCublasOperation
(
GEMM_OP_N
))
?
k
:
_m
;
const
size_t
a_cols
=
(
a_op
==
getCublasOperation
(
GEMM_OP_N
))
?
k
:
_m
;
const
size_t
b_rows
=
(
b_op
==
getCublasOperation
(
GEMM_OP_N
))
?
k
:
_n
;
const
size_t
b_rows
=
(
b_op
==
getCublasOperation
(
GEMM_OP_N
))
?
k
:
_n
;
...
...
src/turbomind/utils/gemm_test/CMakeLists.txt
View file @
e38ee081
...
@@ -13,7 +13,8 @@
...
@@ -13,7 +13,8 @@
# limitations under the License.
# limitations under the License.
cmake_minimum_required
(
VERSION 3.8
)
cmake_minimum_required
(
VERSION 3.8
)
find_package
(
CUDAToolkit REQUIRED
)
#find_package(CUDAToolkit REQUIRED)
find_package
(
CUDA REQUIRED
)
set
(
gemm_func_files
set
(
gemm_func_files
gemm_func.cc
gemm_func.cc
...
@@ -51,59 +52,71 @@ set(swin_gemm_func_files
...
@@ -51,59 +52,71 @@ set(swin_gemm_func_files
swin_gemm_func.cc
swin_gemm_func.cc
)
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-fPIC"
)
set
(
CMAKE_CUDA_FLAGS
"
${
CMAKE_CUDA_FLAGS
}
-fPIC"
)
add_library
(
gemm_func STATIC
${
gemm_func_files
}
)
add_library
(
gemm_func STATIC
${
gemm_func_files
}
)
target_link_libraries
(
gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger
)
#target_link_libraries(gemm_func PUBLIC cublas cublasLt cudart cuda_utils logger)
set_property
(
TARGET gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
target_link_libraries
(
gemm_func PUBLIC cublas cudart cuda_utils logger
)
set_property
(
TARGET gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#set_property(TARGET gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
encoder_gemm_func STATIC
${
encoder_gemm_func_files
}
)
add_library
(
encoder_gemm_func STATIC
${
encoder_gemm_func_files
}
)
target_link_libraries
(
encoder_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger
)
#target_link_libraries(encoder_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
target_link_libraries
(
encoder_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger
)
if
(
SPARSITY_SUPPORT
)
if
(
SPARSITY_SUPPORT
)
target_link_libraries
(
encoder_gemm_func PUBLIC
CUDA::
cusparse -lcusparseLt
)
target_link_libraries
(
encoder_gemm_func PUBLIC cusparse -lcusparseLt
)
endif
()
endif
()
set_property
(
TARGET encoder_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET encoder_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET encoder_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET encoder_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
encoder_igemm_func STATIC
${
encoder_igemm_func_files
}
)
add_library
(
encoder_igemm_func STATIC
${
encoder_igemm_func_files
}
)
target_link_libraries
(
encoder_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger
)
#target_link_libraries(encoder_igemm_func PUBLIC cublas cublasLt cudart cuda_utils logger)
target_link_libraries
(
encoder_igemm_func PUBLIC cublas cudart cuda_utils logger
)
if
(
SPARSITY_SUPPORT
)
if
(
SPARSITY_SUPPORT
)
target_link_libraries
(
encoder_igemm_func PUBLIC
CUDA::
cusparse -lcusparseLt
)
target_link_libraries
(
encoder_igemm_func PUBLIC cusparse -lcusparseLt
)
endif
()
endif
()
set_property
(
TARGET encoder_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET encoder_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET encoder_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET encoder_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
decoding_gemm_func STATIC
${
decoding_gemm_func_files
}
)
add_library
(
decoding_gemm_func STATIC
${
decoding_gemm_func_files
}
)
target_link_libraries
(
decoding_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger
)
#target_link_libraries(decoding_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
set_property
(
TARGET decoding_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
target_link_libraries
(
decoding_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger
)
set_property
(
TARGET decoding_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#set_property(TARGET decoding_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET decoding_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
gpt_gemm_func STATIC
${
gpt_gemm_func_files
}
)
add_library
(
gpt_gemm_func STATIC
${
gpt_gemm_func_files
}
)
target_link_libraries
(
gpt_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger
)
#target_link_libraries(gpt_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
target_link_libraries
(
gpt_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger
)
if
(
SPARSITY_SUPPORT
)
if
(
SPARSITY_SUPPORT
)
target_link_libraries
(
gpt_gemm_func PUBLIC
CUDA::
cusparse -lcusparseLt
)
target_link_libraries
(
gpt_gemm_func PUBLIC cusparse -lcusparseLt
)
endif
()
endif
()
set_property
(
TARGET gpt_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET gpt_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET gpt_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET gpt_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
xlnet_gemm_func STATIC
${
xlnet_gemm_func_files
}
)
add_library
(
xlnet_gemm_func STATIC
${
xlnet_gemm_func_files
}
)
target_link_libraries
(
xlnet_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger
)
#target_link_libraries(xlnet_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
set_property
(
TARGET xlnet_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
target_link_libraries
(
xlnet_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger
)
set_property
(
TARGET xlnet_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#set_property(TARGET xlnet_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET xlnet_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
t5_gemm_func STATIC
${
t5_gemm_func_files
}
)
add_library
(
t5_gemm_func STATIC
${
t5_gemm_func_files
}
)
target_link_libraries
(
t5_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger
)
#target_link_libraries(t5_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
target_link_libraries
(
t5_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger
)
if
(
SPARSITY_SUPPORT
)
if
(
SPARSITY_SUPPORT
)
target_link_libraries
(
t5_gemm_func PUBLIC
CUDA::
cusparse -lcusparseLt
)
target_link_libraries
(
t5_gemm_func PUBLIC cusparse -lcusparseLt
)
endif
()
endif
()
set_property
(
TARGET t5_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
#
set_property(TARGET t5_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property
(
TARGET t5_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET t5_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
swin_igemm_func STATIC
${
swin_igemm_func_files
}
)
add_library
(
swin_igemm_func STATIC
${
swin_igemm_func_files
}
)
target_link_libraries
(
swin_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func encoder_igemm_func cuda_utils logger
)
#target_link_libraries(swin_igemm_func PUBLIC cublas cublasLt cudart gemm_func encoder_igemm_func cuda_utils logger)
set_property
(
TARGET swin_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
target_link_libraries
(
swin_igemm_func PUBLIC cublas cudart gemm_func encoder_igemm_func cuda_utils logger
)
set_property
(
TARGET swin_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#set_property(TARGET swin_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET swin_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
swin_gemm_func STATIC
${
swin_gemm_func_files
}
)
add_library
(
swin_gemm_func STATIC
${
swin_gemm_func_files
}
)
target_link_libraries
(
swin_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger
)
#target_link_libraries(swin_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
set_property
(
TARGET swin_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
target_link_libraries
(
swin_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger
)
set_property
(
TARGET swin_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#set_property(TARGET swin_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET swin_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
src/turbomind/utils/gemm_test/decoding_gemm_func.cc
View file @
e38ee081
...
@@ -130,8 +130,8 @@ void generate_decoding_gemm_config(int batch_size,
...
@@ -130,8 +130,8 @@ void generate_decoding_gemm_config(int batch_size,
cublasHandle_t
cublas_handle
;
cublasHandle_t
cublas_handle
;
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
cublasLtHandle_t
ltHandle
;
//
cublasLtHandle_t ltHandle;
check_cuda_error
(
cublasLtCreate
(
&
ltHandle
));
//
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t
AType
;
cudaDataType_t
AType
;
cudaDataType_t
BType
;
cudaDataType_t
BType
;
...
@@ -156,8 +156,10 @@ void generate_decoding_gemm_config(int batch_size,
...
@@ -156,8 +156,10 @@ void generate_decoding_gemm_config(int batch_size,
BType
=
CUDA_R_16F
;
BType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
computeType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
}
#ifdef ENABLE_BF16
#ifdef ENABLE_BF16
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
...
@@ -166,8 +168,10 @@ void generate_decoding_gemm_config(int batch_size,
...
@@ -166,8 +168,10 @@ void generate_decoding_gemm_config(int batch_size,
BType
=
CUDA_R_16BF
;
BType
=
CUDA_R_16BF
;
CType
=
CUDA_R_16BF
;
CType
=
CUDA_R_16BF
;
computeType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
}
#endif
#endif
using
scaleT
=
typename
ScaleTypeConverter
<
T
>::
Type
;
using
scaleT
=
typename
ScaleTypeConverter
<
T
>::
Type
;
...
@@ -241,38 +245,39 @@ void generate_decoding_gemm_config(int batch_size,
...
@@ -241,38 +245,39 @@ void generate_decoding_gemm_config(int batch_size,
const
int
ALGO_COMBINATIONS
=
5000
;
const
int
ALGO_COMBINATIONS
=
5000
;
customMatmulPerf_t
perfResults
[
ALGO_COMBINATIONS
];
customMatmulPerf_t
perfResults
[
ALGO_COMBINATIONS
];
LtHgemmCustomFind
<
T
,
scaleT
>
(
ltHandle
,
// LtHgemmCustomFind<T, scaleT>(ltHandle,
batch_size
*
beam_width
,
// batch_size * beam_width,
seq_len
,
// seq_len,
head_num
,
// head_num,
size_per_head
,
// size_per_head,
n
,
// n,
m
,
// m,
k
,
// k,
&
alpha
,
// &alpha,
d_B
,
// d_B,
d_A
,
// d_A,
&
beta
,
// &beta,
d_C
,
// d_C,
cublas_workspace
,
// cublas_workspace,
workSpaceSize
,
// workSpaceSize,
fd
,
// fd,
perfResults
,
// perfResults,
ALGO_COMBINATIONS
);
// ALGO_COMBINATIONS);
if
(
perfResults
[
0
].
time
<
exec_time
)
{
// if (perfResults[0].time < exec_time) {
printPerfStructure
(
batch_size
*
beam_width
,
// printPerfStructure(batch_size * beam_width,
seq_len
,
// seq_len,
head_num
,
// head_num,
size_per_head
,
// size_per_head,
n
,
// n,
m
,
// m,
k
,
// k,
perfResults
[
0
],
// perfResults[0],
fd
,
// fd,
data_type
,
// data_type,
0
);
// 0);
}
// }
else
{
// else {
{
fprintf
(
fd
,
fprintf
(
fd
,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
...
...
src/turbomind/utils/gemm_test/encoder_gemm_func.cc
View file @
e38ee081
...
@@ -127,8 +127,8 @@ void generate_encoder_gemm_config(
...
@@ -127,8 +127,8 @@ void generate_encoder_gemm_config(
cublasHandle_t
cublas_handle
;
cublasHandle_t
cublas_handle
;
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
cublasLtHandle_t
ltHandle
;
//
cublasLtHandle_t ltHandle;
check_cuda_error
(
cublasLtCreate
(
&
ltHandle
));
//
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t
AType
;
cudaDataType_t
AType
;
cudaDataType_t
BType
;
cudaDataType_t
BType
;
...
@@ -153,8 +153,10 @@ void generate_encoder_gemm_config(
...
@@ -153,8 +153,10 @@ void generate_encoder_gemm_config(
BType
=
CUDA_R_16F
;
BType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
computeType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
}
#ifdef ENABLE_BF16
#ifdef ENABLE_BF16
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
...
@@ -163,8 +165,10 @@ void generate_encoder_gemm_config(
...
@@ -163,8 +165,10 @@ void generate_encoder_gemm_config(
BType
=
CUDA_R_16BF
;
BType
=
CUDA_R_16BF
;
CType
=
CUDA_R_16BF
;
CType
=
CUDA_R_16BF
;
computeType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
}
#endif
#endif
using
scaleT
=
typename
ScaleTypeConverter
<
T
,
false
>::
Type
;
using
scaleT
=
typename
ScaleTypeConverter
<
T
,
false
>::
Type
;
...
@@ -331,30 +335,31 @@ void generate_encoder_gemm_config(
...
@@ -331,30 +335,31 @@ void generate_encoder_gemm_config(
// Let try a fixed number of combinations
// Let try a fixed number of combinations
const
int
ALGO_COMBINATIONS
=
5000
;
const
int
ALGO_COMBINATIONS
=
5000
;
customMatmulPerf_t
perfResults
[
ALGO_COMBINATIONS
];
customMatmulPerf_t
perfResults
[
ALGO_COMBINATIONS
];
LtHgemmCustomFind
<
T
,
scaleT
>
(
ltHandle
,
// LtHgemmCustomFind<T, scaleT>(ltHandle,
batch_size
,
// batch_size,
seq_len
,
// seq_len,
head_num
,
// head_num,
size_per_head
,
// size_per_head,
n
,
// n,
m
,
// m,
k
,
// k,
&
alpha
,
// &alpha,
d_B
,
// d_B,
d_A
,
// d_A,
&
beta
,
// &beta,
d_C
,
// d_C,
cublas_workspace
,
// cublas_workspace,
workSpaceSize
,
// workSpaceSize,
fd
,
// fd,
perfResults
,
// perfResults,
ALGO_COMBINATIONS
);
// ALGO_COMBINATIONS);
if
(
perfResults
[
0
].
time
<
exec_time
)
{
// if (perfResults[0].time < exec_time) {
printPerfStructure
(
// printPerfStructure(
batch_size
,
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
perfResults
[
0
],
fd
,
data_type
,
0
);
// batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0);
exec_time
=
perfResults
[
0
].
time
;
// exec_time = perfResults[0].time;
}
// }
else
{
// else {
{
fprintf
(
fd
,
fprintf
(
fd
,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
...
...
src/turbomind/utils/gemm_test/encoder_igemm_func.cc
View file @
e38ee081
...
@@ -234,22 +234,22 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
...
@@ -234,22 +234,22 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
cudaDeviceSynchronize
();
cudaDeviceSynchronize
();
auto
start
=
std
::
chrono
::
high_resolution_clock
::
now
();
auto
start
=
std
::
chrono
::
high_resolution_clock
::
now
();
for
(
int
loop
=
0
;
loop
<
repeats
;
loop
++
)
{
for
(
int
loop
=
0
;
loop
<
repeats
;
loop
++
)
{
oneRunStatus
=
cublasLtMatmul
(
ltHandle
,
//
oneRunStatus = cublasLtMatmul(ltHandle,
operationDesc
,
//
operationDesc,
alpha
,
//
alpha,
A
,
//
A,
Adesc
,
//
Adesc,
B
,
//
B,
Bdesc
,
//
Bdesc,
beta
,
//
beta,
C
,
//
C,
Cdesc
,
//
Cdesc,
D
,
//
D,
Ddesc
,
//
Ddesc,
&
algo
,
//
&algo,
workSpace
,
//
workSpace,
workSpaceSizeInBytes
,
//
workSpaceSizeInBytes,
stream
);
//
stream);
}
}
cudaDeviceSynchronize
();
cudaDeviceSynchronize
();
auto
end
=
std
::
chrono
::
high_resolution_clock
::
now
();
auto
end
=
std
::
chrono
::
high_resolution_clock
::
now
();
...
...
src/turbomind/utils/gemm_test/gpt_gemm_func.cc
View file @
e38ee081
...
@@ -223,8 +223,8 @@ void generate_gpt_gemm_config(int batch_size,
...
@@ -223,8 +223,8 @@ void generate_gpt_gemm_config(int batch_size,
cublasHandle_t
cublas_handle
;
cublasHandle_t
cublas_handle
;
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
cublasLtHandle_t
ltHandle
;
//
cublasLtHandle_t ltHandle;
check_cuda_error
(
cublasLtCreate
(
&
ltHandle
));
//
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t
AType
;
cudaDataType_t
AType
;
cudaDataType_t
BType
;
cudaDataType_t
BType
;
...
@@ -253,8 +253,10 @@ void generate_gpt_gemm_config(int batch_size,
...
@@ -253,8 +253,10 @@ void generate_gpt_gemm_config(int batch_size,
CType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
DType
=
CUDA_R_16F
;
DType
=
CUDA_R_16F
;
computeType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
}
#ifdef ENABLE_BF16
#ifdef ENABLE_BF16
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
...
@@ -264,8 +266,10 @@ void generate_gpt_gemm_config(int batch_size,
...
@@ -264,8 +266,10 @@ void generate_gpt_gemm_config(int batch_size,
CType
=
CUDA_R_16BF
;
CType
=
CUDA_R_16BF
;
DType
=
CUDA_R_16BF
;
DType
=
CUDA_R_16BF
;
computeType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
}
#endif
#endif
#ifdef ENABLE_FP8
#ifdef ENABLE_FP8
...
@@ -293,8 +297,10 @@ void generate_gpt_gemm_config(int batch_size,
...
@@ -293,8 +297,10 @@ void generate_gpt_gemm_config(int batch_size,
DType_FP8
[
9
]
=
CUDA_R_16BF
;
DType_FP8
[
9
]
=
CUDA_R_16BF
;
#endif
#endif
computeType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
}
#endif
#endif
float
alpha
=
(
float
)
1.0
f
;
float
alpha
=
(
float
)
1.0
f
;
...
@@ -456,44 +462,45 @@ void generate_gpt_gemm_config(int batch_size,
...
@@ -456,44 +462,45 @@ void generate_gpt_gemm_config(int batch_size,
customMatmulPerf_t
perfResults
[
ALGO_COMBINATIONS
];
customMatmulPerf_t
perfResults
[
ALGO_COMBINATIONS
];
// for gpt, computeType & scaleType should be FP32
// for gpt, computeType & scaleType should be FP32
LtHgemmCustomFind
<
T
,
float
>
(
ltHandle
,
// LtHgemmCustomFind<T, float>(ltHandle,
batch_size
*
beam_width
,
// batch_size * beam_width,
i
==
1
||
i
==
2
?
max_input_len
:
1
,
// i == 1 || i == 2 ? max_input_len : 1,
head_num
,
// head_num,
size_per_head
,
// size_per_head,
n
,
// n,
m
,
// m,
k
,
// k,
&
alpha
,
// &alpha,
d_B
,
// d_B,
d_A
,
// d_A,
&
beta
,
// &beta,
d_C
,
// d_C,
cublas_workspace
,
// cublas_workspace,
workSpaceSize
,
// workSpaceSize,
fd
,
// fd,
perfResults
,
// perfResults,
ALGO_COMBINATIONS
,
// ALGO_COMBINATIONS,
DType_FP8
[
i
],
// DType_FP8[i],
batchCount
[
i
],
// batchCount[i],
strideA
[
i
],
// strideA[i],
strideB
[
i
],
// strideB[i],
strideD
[
i
]);
// strideD[i]);
if
(
perfResults
[
0
].
time
<
exec_time
)
{
// if (perfResults[0].time < exec_time) {
printPerfStructure
(
batch_size
*
beam_width
,
// printPerfStructure(batch_size * beam_width,
seq_len
,
// seq_len,
head_num
,
// head_num,
size_per_head
,
// size_per_head,
n
,
// n,
m
,
// m,
k
,
// k,
perfResults
[
0
],
// perfResults[0],
fd
,
// fd,
data_type
,
// data_type,
0
,
// 0,
batchCount
[
i
]);
// batchCount[i]);
}
// }
else
{
// else {
{
fprintf
(
fd
,
fprintf
(
fd
,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
...
...
src/turbomind/utils/gemm_test/swin_gemm_func.cc
View file @
e38ee081
...
@@ -133,8 +133,8 @@ void generate_swin_gemm_config(
...
@@ -133,8 +133,8 @@ void generate_swin_gemm_config(
cublasHandle_t
cublas_handle
;
cublasHandle_t
cublas_handle
;
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
cublasLtHandle_t
ltHandle
;
//
cublasLtHandle_t ltHandle;
check_cuda_error
(
cublasLtCreate
(
&
ltHandle
));
//
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t
AType
;
cudaDataType_t
AType
;
cudaDataType_t
BType
;
cudaDataType_t
BType
;
...
@@ -159,8 +159,10 @@ void generate_swin_gemm_config(
...
@@ -159,8 +159,10 @@ void generate_swin_gemm_config(
BType
=
CUDA_R_16F
;
BType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
computeType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
}
#ifdef ENABLE_BF16
#ifdef ENABLE_BF16
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
...
@@ -169,8 +171,10 @@ void generate_swin_gemm_config(
...
@@ -169,8 +171,10 @@ void generate_swin_gemm_config(
BType
=
CUDA_R_16BF
;
BType
=
CUDA_R_16BF
;
CType
=
CUDA_R_16BF
;
CType
=
CUDA_R_16BF
;
computeType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
}
#endif
#endif
using
scaleT
=
typename
ScaleTypeConverter
<
T
,
false
>::
Type
;
using
scaleT
=
typename
ScaleTypeConverter
<
T
,
false
>::
Type
;
...
@@ -309,30 +313,31 @@ void generate_swin_gemm_config(
...
@@ -309,30 +313,31 @@ void generate_swin_gemm_config(
const
int
ALGO_COMBINATIONS
=
5000
;
const
int
ALGO_COMBINATIONS
=
5000
;
customMatmulPerf_t
perfResults
[
ALGO_COMBINATIONS
];
customMatmulPerf_t
perfResults
[
ALGO_COMBINATIONS
];
LtHgemmCustomFind
<
T
,
scaleT
>
(
ltHandle
,
// LtHgemmCustomFind<T, scaleT>(ltHandle,
batch_size
,
// batch_size,
seq_len
,
// seq_len,
head_num
,
// head_num,
size_per_head
,
// size_per_head,
n
,
// n,
m
,
// m,
k
,
// k,
&
alpha
,
// &alpha,
d_B
,
// d_B,
d_A
,
// d_A,
&
beta
,
// &beta,
d_C
,
// d_C,
cublas_workspace
,
// cublas_workspace,
workSpaceSize
,
// workSpaceSize,
fd
,
// fd,
perfResults
,
// perfResults,
ALGO_COMBINATIONS
);
// ALGO_COMBINATIONS);
if
(
perfResults
[
0
].
time
<
exec_time
)
{
// if (perfResults[0].time < exec_time) {
printPerfStructure
(
// printPerfStructure(
batch_size
,
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
perfResults
[
0
],
fd
,
data_type
,
0
);
// batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0);
exec_time
=
perfResults
[
0
].
time
;
// exec_time = perfResults[0].time;
}
// }
else
{
// else {
{
fprintf
(
fd
,
fprintf
(
fd
,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
...
...
src/turbomind/utils/gemm_test/swin_igemm_func.cc
View file @
e38ee081
...
@@ -144,23 +144,23 @@ int igemm_config_INT8IO(int m, int n, int k, FILE* fout, void* buffer)
...
@@ -144,23 +144,23 @@ int igemm_config_INT8IO(int m, int n, int k, FILE* fout, void* buffer)
int8_t
*
d_B
=
d_A
+
m
*
k
;
// k * n, stored in column-major
int8_t
*
d_B
=
d_A
+
m
*
k
;
// k * n, stored in column-major
int8_t
*
d_C
=
(
int8_t
*
)(
d_B
+
k
*
n
);
// m * n, stored in column-major
int8_t
*
d_C
=
(
int8_t
*
)(
d_B
+
k
*
n
);
// m * n, stored in column-major
cublasLtHandle_t
ltHandle
;
//
cublasLtHandle_t ltHandle;
cublasLtCreate
(
&
ltHandle
);
//
cublasLtCreate(&ltHandle);
LtIgemmCustomFind
(
ltHandle
,
//
LtIgemmCustomFind(ltHandle,
m
,
//
m,
n
,
//
n,
k
,
//
k,
&
alpha
,
/* host pointer */
//
&alpha, /* host pointer */
d_A
,
//
d_A,
d_B
,
//
d_B,
&
beta
,
/* host pointer */
//
&beta, /* host pointer */
d_C
,
//
d_C,
NULL
,
//
NULL,
0
,
//
0,
fout
);
//
fout);
cublasLtDestroy
(
ltHandle
);
//
cublasLtDestroy(ltHandle);
return
0
;
return
0
;
}
}
...
...
src/turbomind/utils/gemm_test/t5_gemm_func.cc
View file @
e38ee081
...
@@ -195,8 +195,8 @@ void generate_t5_gemm_config(int batch_size,
...
@@ -195,8 +195,8 @@ void generate_t5_gemm_config(int batch_size,
cublasHandle_t
cublas_handle
;
cublasHandle_t
cublas_handle
;
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
cublasLtHandle_t
ltHandle
;
//
cublasLtHandle_t ltHandle;
check_cuda_error
(
cublasLtCreate
(
&
ltHandle
));
//
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t
AType
;
cudaDataType_t
AType
;
cudaDataType_t
BType
;
cudaDataType_t
BType
;
...
@@ -221,8 +221,10 @@ void generate_t5_gemm_config(int batch_size,
...
@@ -221,8 +221,10 @@ void generate_t5_gemm_config(int batch_size,
BType
=
CUDA_R_16F
;
BType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
computeType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
}
#ifdef ENABLE_BF16
#ifdef ENABLE_BF16
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
...
@@ -231,8 +233,10 @@ void generate_t5_gemm_config(int batch_size,
...
@@ -231,8 +233,10 @@ void generate_t5_gemm_config(int batch_size,
BType
=
CUDA_R_16BF
;
BType
=
CUDA_R_16BF
;
CType
=
CUDA_R_16BF
;
CType
=
CUDA_R_16BF
;
computeType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
}
#endif
#endif
float
f_alpha
=
(
float
)
1.0
f
;
float
f_alpha
=
(
float
)
1.0
f
;
...
@@ -442,60 +446,61 @@ void generate_t5_gemm_config(int batch_size,
...
@@ -442,60 +446,61 @@ void generate_t5_gemm_config(int batch_size,
scaleT
alpha_scale
=
(
scaleT
)
1.0
f
;
scaleT
alpha_scale
=
(
scaleT
)
1.0
f
;
scaleT
beta_scale
=
(
scaleT
)
0.0
f
;
scaleT
beta_scale
=
(
scaleT
)
0.0
f
;
LtHgemmCustomFind
<
T
,
scaleT
>
(
ltHandle
,
// LtHgemmCustomFind<T, scaleT>(ltHandle,
m
,
// m,
seq_len
,
// seq_len,
head_num
,
// head_num,
size_per_head
,
// size_per_head,
n
,
// n,
m
,
// m,
k
,
// k,
&
(
alpha_scale
),
// &(alpha_scale),
d_B
,
// d_B,
d_A
,
// d_A,
&
(
beta_scale
),
// &(beta_scale),
d_C
,
// d_C,
cublas_workspace
,
// cublas_workspace,
workSpaceSize
,
// workSpaceSize,
fd
,
// fd,
perfResults
,
// perfResults,
ALGO_COMBINATIONS
);
// ALGO_COMBINATIONS);
}
else
{
LtHgemmCustomFind
<
T
,
float
>
(
ltHandle
,
m
,
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
&
(
f_alpha
),
d_B
,
d_A
,
&
(
f_beta
),
d_C
,
cublas_workspace
,
workSpaceSize
,
fd
,
perfResults
,
ALGO_COMBINATIONS
);
}
if
(
perfResults
[
0
].
time
<
exec_time
)
{
printPerfStructure
(
batch_size
*
(
i
<=
5
||
i
==
1
?
1
:
beam_width
),
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
perfResults
[
0
],
fd
,
data_type
,
0
);
}
}
else
{
else
{
// LtHgemmCustomFind<T, float>(ltHandle,
// m,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &(f_alpha),
// d_B,
// d_A,
// &(f_beta),
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS);
}
// if (perfResults[0].time < exec_time) {
// printPerfStructure(batch_size * (i <= 5 || i == 1 ? 1 : beam_width),
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// perfResults[0],
// fd,
// data_type,
// 0);
// }
// else {
{
fprintf
(
fd
,
fprintf
(
fd
,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment