Unverified commit 4c9959f6, authored by Chen Xin and committed by GitHub

Support windows platform (#209)

* __PRETTY_FUNCTION__

* CASE_K

* uint

* remove not

* HALF_FLT_MAX

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check

* remove gettimeofday

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
parent 0d21f366
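
The change applied most often in the hunks below replaces POSIX `gettimeofday()`/`diffTime()` timing with `std::chrono`, which builds on both Linux and Windows. A minimal, self-contained sketch of that pattern; the arithmetic loop is only a stand-in for the benchmarked cuBLAS calls:

```cpp
#include <chrono>
#include <cstdio>

int main() {
    const int ites = 100;
    // Portable replacement for gettimeofday()/diffTime(): std::chrono works on
    // Windows as well, so <sys/time.h> is no longer needed for timing.
    auto start = std::chrono::high_resolution_clock::now();
    volatile double sink = 0.0;
    for (int ite = 0; ite < ites; ++ite) {
        sink += ite * 0.5;  // stand-in for the benchmarked cublasGemmEx call
    }
    auto end = std::chrono::high_resolution_clock::now();
    // duration<float, milli> mirrors the per-algorithm millisecond cost reported below.
    auto dur = std::chrono::duration<float, std::milli>(end - start);
    std::printf("costs %.3fms\n", dur.count() / ites);
    return 0;
}
```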
@@ -51,7 +51,7 @@ void CustomAllReduceComm<T>::customAllReduce(size_t elts, cudaStream_t stream)
     invokeOneOrTwoShotAllReduceKernel<T>(param_, stream);
     // swap back
-    output_tensor_->at(0).data = (const void*)tmp_tensor_data_;
+    output_tensor_->at(0).data = (void*)tmp_tensor_data_;
 }
 template<typename T>
@@ -114,7 +114,7 @@ bool CustomAllReduceComm<T>::swapInternalBuffer(std::vector<Tensor>* tensor_buff
     if (rank_size_ > 1 && elts * sizeof(T) <= CUSTOM_AR_SIZE_THRESHOLD) {
         tmp_tensor_data_ = (T*)(tensor_buffer->at(0).data);
         output_tensor_ = tensor_buffer;
-        tensor_buffer->at(0).data = param_.peer_comm_buffer_ptrs[rank_];
+        tensor_buffer->at(0).data = (void*)param_.peer_comm_buffer_ptrs[rank_];
         param_.local_output_buffer_ptr = tmp_tensor_data_;
         return true;
     }
......
@@ -13,6 +13,8 @@
 # limitations under the License.
 cmake_minimum_required(VERSION 3.8)
+find_package(CUDAToolkit REQUIRED)
 set(gemm_func_files
     gemm_func.cc
 )
@@ -50,58 +52,58 @@ set(swin_gemm_func_files
 )
 add_library(gemm_func STATIC ${gemm_func_files})
-target_link_libraries(gemm_func PUBLIC -lcublas -lcublasLt -lcudart cuda_utils logger)
+target_link_libraries(gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger)
 set_property(TARGET gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 add_library(encoder_gemm_func STATIC ${encoder_gemm_func_files})
-target_link_libraries(encoder_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
+target_link_libraries(encoder_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
 if (SPARSITY_SUPPORT)
-    target_link_libraries(encoder_gemm_func PUBLIC -lcusparse -lcusparseLt)
+    target_link_libraries(encoder_gemm_func PUBLIC CUDA::cusparse -lcusparseLt)
 endif()
 set_property(TARGET encoder_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET encoder_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 add_library(encoder_igemm_func STATIC ${encoder_igemm_func_files})
-target_link_libraries(encoder_igemm_func PUBLIC -lcublas -lcublasLt -lcudart cuda_utils logger)
+target_link_libraries(encoder_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger)
 if (SPARSITY_SUPPORT)
-    target_link_libraries(encoder_igemm_func PUBLIC -lcusparse -lcusparseLt)
+    target_link_libraries(encoder_igemm_func PUBLIC CUDA::cusparse -lcusparseLt)
 endif()
 set_property(TARGET encoder_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET encoder_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 add_library(decoding_gemm_func STATIC ${decoding_gemm_func_files})
-target_link_libraries(decoding_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
+target_link_libraries(decoding_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
 set_property(TARGET decoding_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET decoding_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 add_library(gpt_gemm_func STATIC ${gpt_gemm_func_files})
-target_link_libraries(gpt_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
+target_link_libraries(gpt_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
 if (SPARSITY_SUPPORT)
-    target_link_libraries(gpt_gemm_func PUBLIC -lcusparse -lcusparseLt)
+    target_link_libraries(gpt_gemm_func PUBLIC CUDA::cusparse -lcusparseLt)
 endif()
 set_property(TARGET gpt_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET gpt_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 add_library(xlnet_gemm_func STATIC ${xlnet_gemm_func_files})
-target_link_libraries(xlnet_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
+target_link_libraries(xlnet_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
 set_property(TARGET xlnet_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET xlnet_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 add_library(t5_gemm_func STATIC ${t5_gemm_func_files})
-target_link_libraries(t5_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
+target_link_libraries(t5_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
 if (SPARSITY_SUPPORT)
-    target_link_libraries(t5_gemm_func PUBLIC -lcusparse -lcusparseLt)
+    target_link_libraries(t5_gemm_func PUBLIC CUDA::cusparse -lcusparseLt)
 endif()
 set_property(TARGET t5_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET t5_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 add_library(swin_igemm_func STATIC ${swin_igemm_func_files})
-target_link_libraries(swin_igemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func encoder_igemm_func cuda_utils logger)
+target_link_libraries(swin_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func encoder_igemm_func cuda_utils logger)
 set_property(TARGET swin_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET swin_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 add_library(swin_gemm_func STATIC ${swin_gemm_func_files})
-target_link_libraries(swin_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
+target_link_libraries(swin_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
 set_property(TARGET swin_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET swin_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
@@ -15,6 +15,8 @@
  */
 #include "src/turbomind/utils/gemm_test/decoding_gemm_func.h"
+#include "src/turbomind/macro.h"
+#include <chrono>
 namespace turbomind {
@@ -137,7 +139,6 @@ void generate_decoding_gemm_config(int batch_size,
     cudaDataType_t computeType;
     int startAlgo, endAlgo;
     const int ites = 100;
-    struct timeval start, end;
     CublasDataType data_type;
     if (std::is_same<T, float>::value) {
@@ -195,7 +196,7 @@ void generate_decoding_gemm_config(int batch_size,
     for (int algo = startAlgo; algo <= endAlgo; algo++) {
         cublasStatus_t status;
         cudaDeviceSynchronize();
-        gettimeofday(&start, NULL);
+        auto start = std::chrono::high_resolution_clock::now();
         for (int ite = 0; ite < ites; ++ite) {
             status = cublasGemmEx(cublas_handle,
                                   CUBLAS_OP_N,
@@ -221,11 +222,12 @@ void generate_decoding_gemm_config(int batch_size,
             }
         }
         cudaDeviceSynchronize();
-        gettimeofday(&end, NULL);
+        auto end = std::chrono::high_resolution_clock::now();
+        auto dur = std::chrono::duration<float, std::milli>(end - start);
         if (status == CUBLAS_STATUS_SUCCESS) {
-            printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
-            if (diffTime(start, end) / ites < exec_time) {
-                exec_time = diffTime(start, end) / ites;
+            printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
+            if (dur.count() / ites < exec_time) {
+                exec_time = dur.count() / ites;
                 fast_algo = algo;
             }
         }
@@ -236,7 +238,7 @@ void generate_decoding_gemm_config(int batch_size,
     if (data_type != FLOAT_DATATYPE) {
         printf("***cublasLt Gemm Testing Begin***\n");
         // Let try a fixed number of combinations
-        int ALGO_COMBINATIONS = 5000;
+        const int ALGO_COMBINATIONS = 5000;
         customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
         LtHgemmCustomFind<T, scaleT>(ltHandle,
......
@@ -27,8 +27,10 @@
 #include <cuda_fp16.h>
 #include <cuda_profiler_api.h>
 #include <map>
+#ifdef __linux__
 #include <sys/time.h>
 #include <unistd.h>
+#endif
 #include <vector>
 namespace turbomind {
......
@@ -15,6 +15,8 @@
  */
 #include "src/turbomind/utils/gemm_test/encoder_gemm_func.h"
+#include "src/turbomind/macro.h"
+#include <chrono>
 namespace turbomind {
@@ -134,7 +136,6 @@ void generate_encoder_gemm_config(
     cudaDataType_t computeType;
     int startAlgo, endAlgo;
     const int ites = 100;
-    struct timeval start, end;
     CublasDataType data_type;
     if (std::is_same<T, float>::value) {
@@ -212,7 +213,7 @@ void generate_encoder_gemm_config(
     for (int algo = startAlgo; algo <= endAlgo; algo++) {
         cublasStatus_t status;
         cudaDeviceSynchronize();
-        gettimeofday(&start, NULL);
+        auto start = std::chrono::high_resolution_clock::now();
         for (int ite = 0; ite < ites; ++ite) {
             if (i < 3) {
                 status = cublasGemmEx(cublas_handle,
@@ -312,11 +313,12 @@ void generate_encoder_gemm_config(
             }
         }
         cudaDeviceSynchronize();
-        gettimeofday(&end, NULL);
+        auto end = std::chrono::high_resolution_clock::now();
+        auto dur = std::chrono::duration<float, std::milli>(end - start);
         if (status == CUBLAS_STATUS_SUCCESS) {
-            printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
-            if (diffTime(start, end) / ites < exec_time) {
-                exec_time = diffTime(start, end) / ites;
+            printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
+            if (dur.count() / ites < exec_time) {
+                exec_time = dur.count() / ites;
                 fast_algo = algo;
             }
         }
@@ -327,7 +329,7 @@ void generate_encoder_gemm_config(
     if (i < 3 && data_type != FLOAT_DATATYPE) {
         printf("***cublasLt Gemm Testing Begin***\n");
         // Let try a fixed number of combinations
-        int ALGO_COMBINATIONS = 5000;
+        const int ALGO_COMBINATIONS = 5000;
         customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
         LtHgemmCustomFind<T, scaleT>(ltHandle,
                                      batch_size,
@@ -485,7 +487,7 @@ void generate_encoder_gemm_config(
                 &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
             CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order))
             CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
-            gettimeofday(&start, NULL);
+            auto start = std::chrono::high_resolution_clock::now();
             for (int ite = 0; ite < ites; ++ite) {
                 // initializing MatDesc takes a lot of time
                 // and these descs can be stored to other place
@@ -516,10 +518,11 @@ void generate_encoder_gemm_config(
                 CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
             }
             cudaDeviceSynchronize();
-            gettimeofday(&end, NULL);
-            printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
-            if (diffTime(start, end) < exec_time) {
-                exec_time = diffTime(start, end);
+            auto end = std::chrono::high_resolution_clock::now();
+            auto dur = std::chrono::duration<float, std::milli>(end - start);
+            printf("algo_%d costs %.3fms \n", alg, dur.count() / ites);
+            if (dur.count() < exec_time) {
+                exec_time = dur.count();
                 fast_algo = alg;
             }
         }
......
@@ -27,8 +27,10 @@
 #include <cuda_fp16.h>
 #include <cuda_profiler_api.h>
 #include <map>
+#ifdef __linux__
 #include <sys/time.h>
 #include <unistd.h>
+#endif
 #include <vector>
 namespace turbomind {
......
@@ -15,6 +15,8 @@
  */
 #include "encoder_igemm_func.h"
+#include "src/turbomind/macro.h"
+#include <chrono>
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
@@ -83,7 +85,7 @@ int printPerfStructure(int m, int n, int k, const customMatmulPerf_t& perf, FILE
 #if (CUDART_VERSION >= 11000)
     cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
 #else
     stages = 0;
 #endif
     printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d "
@@ -149,7 +151,7 @@ int printBatchPerfStructure(
 #if (CUDART_VERSION >= 11000)
     cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
 #else
     stages = 0;
 #endif
     printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d "
@@ -228,10 +230,9 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
     cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult);
     if (algoStatus == CUBLAS_STATUS_SUCCESS) {
         if (heurResult.workspaceSize <= workSpaceSizeInBytes) {
-            struct timeval start, end;
             cublasStatus_t oneRunStatus;
             cudaDeviceSynchronize();
-            gettimeofday(&start, NULL);
+            auto start = std::chrono::high_resolution_clock::now();
             for (int loop = 0; loop < repeats; loop++) {
                 oneRunStatus = cublasLtMatmul(ltHandle,
                                               operationDesc,
@@ -251,11 +252,12 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
                                               stream);
             }
             cudaDeviceSynchronize();
-            gettimeofday(&end, NULL);
+            auto end = std::chrono::high_resolution_clock::now();
+            auto dur = std::chrono::duration<float, std::milli>(end - start);
             if (oneRunStatus != CUBLAS_STATUS_SUCCESS) {
                 algoStatus = oneRunStatus;
             }
-            float time = diffTime(start, end);
+            float time = dur.count();
             // For the moment only add successful findings
             if (algoStatus == CUBLAS_STATUS_SUCCESS) {
                 perfResults.algo = algo;
@@ -352,7 +354,7 @@ int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
         order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
     }
 #else
     order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
 #endif
     int ldaTransform = 32 * m;
@@ -369,7 +371,7 @@ int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
 #if (CUDART_VERSION >= 11000)
     status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
 #else
     status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
 #endif
     if (status != CUBLAS_STATUS_SUCCESS) {
         goto CLEANUP;
@@ -689,7 +691,7 @@ int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
         order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
     }
 #else
     order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
 #endif
     int ldaTransform = 32 * m;
@@ -711,7 +713,7 @@ int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
 #if (CUDART_VERSION >= 11000)
     status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
 #else
     status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
 #endif
     if (status != CUBLAS_STATUS_SUCCESS) {
         goto CLEANUP;
@@ -1166,11 +1168,10 @@ int generate_encoder_igemm_config(
     }
     if (do_sparse_test) {
         printf("***cusparseLt Gemm Testing Begin***\n");
         const int spgemm_num = 3;
         FILE* fd;
         int line_count = 0;
         const int ites = 100;
-        struct timeval start, end;
         if (!isAppend) {
             fd = fopen(SPIGEMM_CONFIG, "w+");
         }
@@ -1267,7 +1268,7 @@ int generate_encoder_igemm_config(
             &handle, &mat_A, m, k, k, alignment, CUDA_R_8I, row_order, CUSPARSELT_SPARSITY_50_PERCENT))
         CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_8I, col_order))
         CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_8I, col_order))
-        gettimeofday(&start, NULL);
+        auto start = std::chrono::high_resolution_clock::now();
         for (int ite = 0; ite < ites; ++ite) {
             // initializing MatDesc takes a lot of time
             // and these descs can be stored to other place
@@ -1298,10 +1299,11 @@ int generate_encoder_igemm_config(
             CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
         }
         cudaDeviceSynchronize();
-        gettimeofday(&end, NULL);
-        printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
-        if (diffTime(start, end) < exec_time) {
-            exec_time = diffTime(start, end);
+        auto end = std::chrono::high_resolution_clock::now();
+        auto dur = std::chrono::duration<float, std::milli>(end - start);
+        printf("algo_%d costs %.3fms \n", alg, dur.count() / ites);
+        if (dur.count() < exec_time) {
+            exec_time = dur.count();
             fast_algo = alg;
         }
     }
......
@@ -24,9 +24,11 @@
 #include <map>
 #include <stdio.h>
 #include <stdlib.h>
+#ifdef __linux__
 #include <sys/time.h>
-#include <time.h>
 #include <unistd.h>
+#endif
+#include <time.h>
 #include <vector>
 namespace turbomind {
......
@@ -17,6 +17,7 @@
 #include "encoder_gemm_func.h"
 #include <assert.h>
 #include <sys/types.h>
+#include <vector>
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
@@ -268,17 +269,17 @@ int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
     // given algo
     const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
     // Let try a fixed number of combinations
     int AlgoCount = 0;
     int AlgoCountRestrict = 0;  // workspace == 0
     const int maxNumTraversal = 50;  // max number of traversal
-    cublasLtMatmulAlgo_t algos[AlgoCombinations];  // 0 <= workspace <= 32MB
-    cublasLtMatmulAlgo_t algosRestrict[AlgoCombinations];  // workspace == 0
+    std::vector<cublasLtMatmulAlgo_t> algos(AlgoCombinations);          // 0 <= workspace <= 32MB
+    std::vector<cublasLtMatmulAlgo_t> algosRestrict(AlgoCombinations);  // workspace == 0
     const int kernelRepeats = 100;  // number of time the CUDA kernels will be run back to back
     int nbAlgoIds = 0;  // Number of algorithms actually returned by
                         // cublasLtMatmulAlgoGetIds function.
 #define ALGO_IDS 100  // Number of algorithms requested.
     int algoIdA[ALGO_IDS];  // Array containing the algorithm IDs returned by
                             // cublasLtMatmulAlgoGetIds function.
     cudaDataType_t Atype, Btype, Ctype, scaleType, Dtype;
 #if (CUDART_VERSION >= 11000)
     cublasComputeType_t computeType;
......
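
In the LtHgemmCustomFind hunk above, the algorithm-descriptor arrays move from the stack into std::vector. If AlgoCombinations is a runtime value at this point, the old declarations were variable-length arrays, a GCC extension that MSVC rejects; the vectors are the portable equivalent and also keep what can be several hundred kilobytes of descriptors off the stack. A rough sketch of the idea (the free function and its name are illustrative, not part of the source):

```cpp
#include <cublasLt.h>
#include <cstdio>
#include <vector>

// Illustrative sketch only: mirrors the storage change in LtHgemmCustomFind.
// std::vector allocates the descriptors on the heap, and .data() still yields
// a contiguous cublasLtMatmulAlgo_t* wherever a raw array was used before.
void reserve_algo_storage(int AlgoCombinations) {
    std::vector<cublasLtMatmulAlgo_t> algos(AlgoCombinations);          // 0 <= workspace <= 32MB
    std::vector<cublasLtMatmulAlgo_t> algosRestrict(AlgoCombinations);  // workspace == 0
    std::printf("reserved %zu bytes of algo descriptors\n",
                (algos.size() + algosRestrict.size()) * sizeof(cublasLtMatmulAlgo_t));
}
```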
@@ -28,10 +28,12 @@
 #ifdef ENABLE_FP8
 #include <cuda_fp8.h>
 #endif
-#include <cuda_profiler_api.h>
-#include <map>
+#ifdef __linux__
 #include <sys/time.h>
 #include <unistd.h>
+#endif
+#include <cuda_profiler_api.h>
+#include <map>
 #include <vector>
 namespace turbomind {
......
@@ -15,6 +15,8 @@
  */
 #include "src/turbomind/utils/gemm_test/gpt_gemm_func.h"
+#include "src/turbomind/macro.h"
+#include <chrono>
 namespace turbomind {
@@ -232,7 +234,6 @@ void generate_gpt_gemm_config(int batch_size,
     cudaDataType_t computeType;
     int startAlgo, endAlgo;
     const int ites = 100;
-    struct timeval start, end;
     CublasDataType data_type;
     if (std::is_same<T, float>::value) {
@@ -332,7 +333,7 @@ void generate_gpt_gemm_config(int batch_size,
     for (int algo = startAlgo; algo <= endAlgo; algo++) {
         cublasStatus_t status;
         cudaDeviceSynchronize();
-        gettimeofday(&start, NULL);
+        auto start = std::chrono::high_resolution_clock::now();
         for (int ite = 0; ite < ites; ++ite) {
             if (i == 1) {
                 status = cublasGemmStridedBatchedEx(cublas_handle,
@@ -432,11 +433,12 @@ void generate_gpt_gemm_config(int batch_size,
             }
         }
         cudaDeviceSynchronize();
-        gettimeofday(&end, NULL);
+        auto end = std::chrono::high_resolution_clock::now();
+        auto dur = std::chrono::duration<float, std::milli>(end - start);
         if (status == CUBLAS_STATUS_SUCCESS) {
-            printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
-            if (diffTime(start, end) / ites < exec_time) {
-                exec_time = diffTime(start, end) / ites;
+            printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
+            if (dur.count() / ites < exec_time) {
+                exec_time = dur.count() / ites;
                 fast_algo = algo;
             }
         }
@@ -450,7 +452,7 @@ void generate_gpt_gemm_config(int batch_size,
     if ((data_type != FLOAT_DATATYPE && i != 1 && i != 2 && i != 10) || data_type == FP8_DATATYPE) {
         printf("***cublasLt Gemm Testing Beign***\n");
         // Let try a fixed number of combinations
-        int ALGO_COMBINATIONS = 10000;
+        const int ALGO_COMBINATIONS = 10000;
         customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
         // for gpt, computeType & scaleType should be FP32
@@ -644,7 +646,7 @@ void generate_gpt_gemm_config(int batch_size,
             CHECK_CUSPARSE(
                 cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
             cudaDeviceSynchronize();
-            gettimeofday(&start, NULL);
+            auto start = std::chrono::high_resolution_clock::now();
             for (int ite = 0; ite < ites; ++ite) {
                 // initializing MatDesc takes a lot of time
                 // and these descs can be stored to other place
@@ -675,10 +677,11 @@ void generate_gpt_gemm_config(int batch_size,
                 CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
             }
             cudaDeviceSynchronize();
-            gettimeofday(&end, NULL);
-            printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
-            if (diffTime(start, end) < exec_time) {
-                exec_time = diffTime(start, end);
+            auto end = std::chrono::high_resolution_clock::now();
+            auto dur = std::chrono::duration<float, std::milli>(end - start);
+            printf("algo_%d costs %.3fms \n", alg, dur.count() / ites);
+            if (dur.count() < exec_time) {
+                exec_time = dur.count();
                 fast_algo = alg;
             }
         }
......
@@ -32,8 +32,10 @@
 #endif
 #include <cuda_profiler_api.h>
 #include <map>
+#ifdef __linux__
 #include <sys/time.h>
 #include <unistd.h>
+#endif
 #include <vector>
 namespace turbomind {
......
@@ -15,6 +15,8 @@
  */
 #include "src/turbomind/utils/gemm_test/swin_gemm_func.h"
+#include "src/turbomind/macro.h"
+#include <chrono>
 namespace turbomind {
@@ -140,7 +142,6 @@ void generate_swin_gemm_config(
     cudaDataType_t computeType;
     int startAlgo, endAlgo;
     const int ites = 100;
-    struct timeval start, end;
     CublasDataType data_type;
     if (std::is_same<T, float>::value) {
@@ -211,7 +212,7 @@ void generate_swin_gemm_config(
     for (int algo = startAlgo; algo <= endAlgo; algo++) {
         cublasStatus_t status;
         cudaDeviceSynchronize();
-        gettimeofday(&start, NULL);
+        auto start = std::chrono::high_resolution_clock::now();
         for (int ite = 0; ite < ites; ++ite) {
             if (i < 5) {
                 status = cublasGemmEx(cublas_handle,
@@ -289,11 +290,12 @@ void generate_swin_gemm_config(
             }
         }
         cudaDeviceSynchronize();
-        gettimeofday(&end, NULL);
+        auto end = std::chrono::high_resolution_clock::now();
+        auto dur = std::chrono::duration<float, std::milli>(end - start);
         if (status == CUBLAS_STATUS_SUCCESS) {
-            printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
-            if (diffTime(start, end) / ites < exec_time) {
-                exec_time = diffTime(start, end) / ites;
+            printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
+            if (dur.count() / ites < exec_time) {
+                exec_time = dur.count() / ites;
                 fast_algo = algo;
             }
         }
@@ -304,7 +306,7 @@ void generate_swin_gemm_config(
     if (i < 5 && data_type != FLOAT_DATATYPE) {
         printf("***cublasLt Gemm Testing Begin***\n");
         // Let try a fixed number of combinations
-        int ALGO_COMBINATIONS = 5000;
+        const int ALGO_COMBINATIONS = 5000;
         customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
         LtHgemmCustomFind<T, scaleT>(ltHandle,
......
@@ -27,8 +27,10 @@
 #include <cuda_fp16.h>
 #include <cuda_profiler_api.h>
 #include <map>
+#ifdef __linux__
 #include <sys/time.h>
 #include <unistd.h>
+#endif
 #include <vector>
 namespace turbomind {
......
@@ -15,6 +15,7 @@
  */
 #include "swin_igemm_func.h"
+#include <chrono>
 namespace turbomind {
@@ -86,10 +87,9 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
     cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult);
     if (algoStatus == CUBLAS_STATUS_SUCCESS) {
         if (heurResult.workspaceSize <= workSpaceSizeInBytes) {
-            struct timeval start, end;
             cublasStatus_t oneRunStatus;
             cudaDeviceSynchronize();
-            gettimeofday(&start, NULL);
+            auto start = std::chrono::high_resolution_clock::now();
             for (int loop = 0; loop < repeats; loop++) {
                 oneRunStatus = cublasLtMatmul(ltHandle,
                                               operationDesc,
@@ -109,11 +109,12 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
                                               stream);
             }
             cudaDeviceSynchronize();
-            gettimeofday(&end, NULL);
+            auto end = std::chrono::high_resolution_clock::now();
+            auto dur = std::chrono::duration<float, std::milli>(end - start);
             if (oneRunStatus != CUBLAS_STATUS_SUCCESS) {
                 algoStatus = oneRunStatus;
             }
-            float time = diffTime(start, end);
+            float time = dur.count();
             // For the moment only add successful findings
             if (algoStatus == CUBLAS_STATUS_SUCCESS) {
                 perfResults.algo = algo;
......
@@ -25,9 +25,11 @@
 #include <map>
 #include <stdio.h>
 #include <stdlib.h>
-#include <sys/time.h>
 #include <time.h>
+#ifdef __linux__
+#include <sys/time.h>
 #include <unistd.h>
+#endif
 #include <vector>
 namespace turbomind {
......
@@ -15,6 +15,8 @@
  */
 #include "src/turbomind/utils/gemm_test/t5_gemm_func.h"
+#include "src/turbomind/macro.h"
+#include <chrono>
 namespace turbomind {
@@ -202,7 +204,6 @@ void generate_t5_gemm_config(int batch_size,
     cudaDataType_t computeType;
     int startAlgo, endAlgo;
     const int ites = 100;
-    struct timeval start, end;
     CublasDataType data_type;
     if (std::is_same<T, float>::value) {
@@ -286,7 +287,7 @@ void generate_t5_gemm_config(int batch_size,
     for (int algo = startAlgo; algo <= endAlgo; algo++) {
         cublasStatus_t status;
         cudaDeviceSynchronize();
-        gettimeofday(&start, NULL);
+        auto start = std::chrono::high_resolution_clock::now();
         for (int ite = 0; ite < ites; ++ite) {
             if (i == 0) {
                 status = cublasGemmBatchedEx(cublas_handle,
@@ -408,11 +409,12 @@ void generate_t5_gemm_config(int batch_size,
             }
         }
         cudaDeviceSynchronize();
-        gettimeofday(&end, NULL);
+        auto end = std::chrono::high_resolution_clock::now();
+        auto dur = std::chrono::duration<float, std::milli>(end - start);
         if (status == CUBLAS_STATUS_SUCCESS) {
-            printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
-            if (diffTime(start, end) / ites < exec_time) {
-                exec_time = diffTime(start, end) / ites;
+            printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
+            if (dur.count() / ites < exec_time) {
+                exec_time = dur.count() / ites;
                 fast_algo = algo;
             }
         }
@@ -431,7 +433,7 @@ void generate_t5_gemm_config(int batch_size,
     if (data_type != FLOAT_DATATYPE && i != 1 && i != 2 && i != 0 && i != 10) {
         printf("***cublasLt Gemm Testing Begin***\n");
         // Let try a fixed number of combinations
-        int ALGO_COMBINATIONS = 5000;
+        const int ALGO_COMBINATIONS = 5000;
         customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
         // for t5, computeType & scaleType should be FP32
@@ -643,7 +645,7 @@ void generate_t5_gemm_config(int batch_size,
             CHECK_CUSPARSE(
                 cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
             cudaDeviceSynchronize();
-            gettimeofday(&start, NULL);
+            auto start = std::chrono::high_resolution_clock::now();
             for (int ite = 0; ite < ites; ++ite) {
                 // initializing MatDesc takes a lot of time
                 // and these descs can be stored to other place
@@ -674,10 +676,11 @@ void generate_t5_gemm_config(int batch_size,
                 CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
             }
             cudaDeviceSynchronize();
-            gettimeofday(&end, NULL);
-            printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
-            if (diffTime(start, end) < exec_time) {
-                exec_time = diffTime(start, end);
+            auto end = std::chrono::high_resolution_clock::now();
+            auto dur = std::chrono::duration<float, std::milli>(end - start);
+            printf("algo_%d costs %.3fms \n", alg, dur.count() / ites);
+            if (dur.count() < exec_time) {
+                exec_time = dur.count();
                 fast_algo = alg;
             }
         }
......
@@ -27,8 +27,10 @@
 #include <cuda_fp16.h>
 #include <cuda_profiler_api.h>
 #include <map>
+#ifdef __linux__
 #include <sys/time.h>
 #include <unistd.h>
+#endif
 #include <vector>
 namespace turbomind {
......
@@ -15,6 +15,8 @@
  */
 #include "src/turbomind/utils/gemm_test/xlnet_gemm_func.h"
+#include "src/turbomind/macro.h"
+#include <chrono>
 namespace turbomind {
@@ -225,7 +227,6 @@ void generate_xlnet_gemm_config(int batch_size,
     cudaDataType_t computeType;
     int startAlgo, endAlgo;
     const int ites = 100;
-    struct timeval start, end;
     CublasDataType data_type;
     if (std::is_same<T, float>::value) {
@@ -285,7 +286,7 @@ void generate_xlnet_gemm_config(int batch_size,
     for (int algo = startAlgo; algo <= endAlgo; algo++) {
         cublasStatus_t status;
         cudaDeviceSynchronize();
-        gettimeofday(&start, NULL);
+        auto start = std::chrono::high_resolution_clock::now();
         for (int ite = 0; ite < ites; ++ite) {
             if (i == 1 || i == 7 || i == 8 || i == 9) {
                 status = cublasGemmEx(cublas_handle,
@@ -338,11 +339,12 @@ void generate_xlnet_gemm_config(int batch_size,
             }
         }
         cudaDeviceSynchronize();
-        gettimeofday(&end, NULL);
+        auto end = std::chrono::high_resolution_clock::now();
+        auto dur = std::chrono::duration<float, std::milli>(end - start);
         if (status == CUBLAS_STATUS_SUCCESS) {
-            printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
-            if (diffTime(start, end) / ites < exec_time) {
-                exec_time = diffTime(start, end) / ites;
+            printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
+            if (dur.count() / ites < exec_time) {
+                exec_time = dur.count() / ites;
                 fast_algo = algo;
             }  // end if diffTime
         }      // end status
@@ -353,7 +355,7 @@ void generate_xlnet_gemm_config(int batch_size,
     if ((i == 1 || i == 7 || i == 8 || i == 9) && data_type != FLOAT_DATATYPE) {
         printf("***cublasLt Gemm Testing Begin***\n");
         // Let try a fixed number of combinations
-        int ALGO_COMBINATIONS = 5000;
+        const int ALGO_COMBINATIONS = 5000;
         customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
         LtHgemmCustomFind<T, scaleT>(ltHandle,
......
@@ -27,8 +27,10 @@
 #include <cuda_fp16.h>
 #include <cuda_profiler_api.h>
 #include <map>
+#ifdef __linux__
 #include <sys/time.h>
 #include <unistd.h>
+#endif
 #include <vector>
 namespace turbomind {
......