Unverified Commit 4c9959f6 authored by Chen Xin's avatar Chen Xin Committed by GitHub
Browse files

Support windows platform (#209)

* __PRETTY_FUNCTION__

* CASE_K

* uint

* remove not

* HALF_FLT_MAX

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check

* remove gettimeofday

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
parent 0d21f366
......@@ -51,7 +51,7 @@ void CustomAllReduceComm<T>::customAllReduce(size_t elts, cudaStream_t stream)
invokeOneOrTwoShotAllReduceKernel<T>(param_, stream);
// swap back
output_tensor_->at(0).data = (const void*)tmp_tensor_data_;
output_tensor_->at(0).data = (void*)tmp_tensor_data_;
}
template<typename T>
......@@ -114,7 +114,7 @@ bool CustomAllReduceComm<T>::swapInternalBuffer(std::vector<Tensor>* tensor_buff
if (rank_size_ > 1 && elts * sizeof(T) <= CUSTOM_AR_SIZE_THRESHOLD) {
tmp_tensor_data_ = (T*)(tensor_buffer->at(0).data);
output_tensor_ = tensor_buffer;
tensor_buffer->at(0).data = param_.peer_comm_buffer_ptrs[rank_];
tensor_buffer->at(0).data = (void*)param_.peer_comm_buffer_ptrs[rank_];
param_.local_output_buffer_ptr = tmp_tensor_data_;
return true;
}
......
......@@ -13,6 +13,8 @@
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
find_package(CUDAToolkit REQUIRED)
set(gemm_func_files
gemm_func.cc
)
......@@ -50,58 +52,58 @@ set(swin_gemm_func_files
)
add_library(gemm_func STATIC ${gemm_func_files})
target_link_libraries(gemm_func PUBLIC -lcublas -lcublasLt -lcudart cuda_utils logger)
target_link_libraries(gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger)
set_property(TARGET gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(encoder_gemm_func STATIC ${encoder_gemm_func_files})
target_link_libraries(encoder_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
target_link_libraries(encoder_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(encoder_gemm_func PUBLIC -lcusparse -lcusparseLt)
target_link_libraries(encoder_gemm_func PUBLIC CUDA::cusparse -lcusparseLt)
endif()
set_property(TARGET encoder_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET encoder_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(encoder_igemm_func STATIC ${encoder_igemm_func_files})
target_link_libraries(encoder_igemm_func PUBLIC -lcublas -lcublasLt -lcudart cuda_utils logger)
target_link_libraries(encoder_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(encoder_igemm_func PUBLIC -lcusparse -lcusparseLt)
target_link_libraries(encoder_igemm_func PUBLIC CUDA::cusparse -lcusparseLt)
endif()
set_property(TARGET encoder_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET encoder_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(decoding_gemm_func STATIC ${decoding_gemm_func_files})
target_link_libraries(decoding_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
target_link_libraries(decoding_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
set_property(TARGET decoding_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoding_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(gpt_gemm_func STATIC ${gpt_gemm_func_files})
target_link_libraries(gpt_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
target_link_libraries(gpt_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(gpt_gemm_func PUBLIC -lcusparse -lcusparseLt)
target_link_libraries(gpt_gemm_func PUBLIC CUDA::cusparse -lcusparseLt)
endif()
set_property(TARGET gpt_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gpt_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(xlnet_gemm_func STATIC ${xlnet_gemm_func_files})
target_link_libraries(xlnet_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
target_link_libraries(xlnet_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
set_property(TARGET xlnet_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET xlnet_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(t5_gemm_func STATIC ${t5_gemm_func_files})
target_link_libraries(t5_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
target_link_libraries(t5_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(t5_gemm_func PUBLIC -lcusparse -lcusparseLt)
target_link_libraries(t5_gemm_func PUBLIC CUDA::cusparse -lcusparseLt)
endif()
set_property(TARGET t5_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET t5_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(swin_igemm_func STATIC ${swin_igemm_func_files})
target_link_libraries(swin_igemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func encoder_igemm_func cuda_utils logger)
target_link_libraries(swin_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func encoder_igemm_func cuda_utils logger)
set_property(TARGET swin_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET swin_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(swin_gemm_func STATIC ${swin_gemm_func_files})
target_link_libraries(swin_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
target_link_libraries(swin_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
set_property(TARGET swin_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET swin_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
......@@ -15,6 +15,8 @@
*/
#include "src/turbomind/utils/gemm_test/decoding_gemm_func.h"
#include "src/turbomind/macro.h"
#include <chrono>
namespace turbomind {
......@@ -137,7 +139,6 @@ void generate_decoding_gemm_config(int batch_size,
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
......@@ -195,7 +196,7 @@ void generate_decoding_gemm_config(int batch_size,
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
status = cublasGemmEx(cublas_handle,
CUBLAS_OP_N,
......@@ -221,11 +222,12 @@ void generate_decoding_gemm_config(int batch_size,
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
if (dur.count() / ites < exec_time) {
exec_time = dur.count() / ites;
fast_algo = algo;
}
}
......@@ -236,7 +238,7 @@ void generate_decoding_gemm_config(int batch_size,
if (data_type != FLOAT_DATATYPE) {
printf("***cublasLt Gemm Testing Begin***\n");
// Let try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
......
......@@ -27,8 +27,10 @@
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
namespace turbomind {
......
......@@ -15,6 +15,8 @@
*/
#include "src/turbomind/utils/gemm_test/encoder_gemm_func.h"
#include "src/turbomind/macro.h"
#include <chrono>
namespace turbomind {
......@@ -134,7 +136,6 @@ void generate_encoder_gemm_config(
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
......@@ -212,7 +213,7 @@ void generate_encoder_gemm_config(
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
if (i < 3) {
status = cublasGemmEx(cublas_handle,
......@@ -312,11 +313,12 @@ void generate_encoder_gemm_config(
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
if (dur.count() / ites < exec_time) {
exec_time = dur.count() / ites;
fast_algo = algo;
}
}
......@@ -327,7 +329,7 @@ void generate_encoder_gemm_config(
if (i < 3 && data_type != FLOAT_DATATYPE) {
printf("***cublasLt Gemm Testing Begin***\n");
// Let try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
batch_size,
......@@ -485,7 +487,7 @@ void generate_encoder_gemm_config(
&handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
// initializing MatDesc takes a lot of time
// and these descs can be stored to other place
......@@ -516,10 +518,11 @@ void generate_encoder_gemm_config(
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
if (diffTime(start, end) < exec_time) {
exec_time = diffTime(start, end);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
printf("algo_%d costs %.3fms \n", alg, dur.count() / ites);
if (dur.count() < exec_time) {
exec_time = dur.count();
fast_algo = alg;
}
}
......
......@@ -27,8 +27,10 @@
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
namespace turbomind {
......
......@@ -15,6 +15,8 @@
*/
#include "encoder_igemm_func.h"
#include "src/turbomind/macro.h"
#include <chrono>
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
......@@ -83,7 +85,7 @@ int printPerfStructure(int m, int n, int k, const customMatmulPerf_t& perf, FILE
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
#else
stages = 0;
stages = 0;
#endif
printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d "
......@@ -149,7 +151,7 @@ int printBatchPerfStructure(
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
#else
stages = 0;
stages = 0;
#endif
printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d "
......@@ -228,10 +230,9 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult);
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
if (heurResult.workspaceSize <= workSpaceSizeInBytes) {
struct timeval start, end;
cublasStatus_t oneRunStatus;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int loop = 0; loop < repeats; loop++) {
oneRunStatus = cublasLtMatmul(ltHandle,
operationDesc,
......@@ -251,11 +252,12 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
stream);
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (oneRunStatus != CUBLAS_STATUS_SUCCESS) {
algoStatus = oneRunStatus;
}
float time = diffTime(start, end);
float time = dur.count();
// For the moment only add successful findings
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
perfResults.algo = algo;
......@@ -352,7 +354,7 @@ int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
}
#else
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
int ldaTransform = 32 * m;
......@@ -369,7 +371,7 @@ int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
#if (CUDART_VERSION >= 11000)
status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
#else
status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
#endif
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
......@@ -689,7 +691,7 @@ int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
}
#else
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
int ldaTransform = 32 * m;
......@@ -711,7 +713,7 @@ int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
#if (CUDART_VERSION >= 11000)
status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
#else
status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
#endif
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
......@@ -1166,11 +1168,10 @@ int generate_encoder_igemm_config(
}
if (do_sparse_test) {
printf("***cusparseLt Gemm Testing Begin***\n");
const int spgemm_num = 3;
FILE* fd;
int line_count = 0;
const int ites = 100;
struct timeval start, end;
const int spgemm_num = 3;
FILE* fd;
int line_count = 0;
const int ites = 100;
if (!isAppend) {
fd = fopen(SPIGEMM_CONFIG, "w+");
}
......@@ -1267,7 +1268,7 @@ int generate_encoder_igemm_config(
&handle, &mat_A, m, k, k, alignment, CUDA_R_8I, row_order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_8I, col_order))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_8I, col_order))
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
// initializing MatDesc takes a lot of time
// and these descs can be stored to other place
......@@ -1298,10 +1299,11 @@ int generate_encoder_igemm_config(
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
if (diffTime(start, end) < exec_time) {
exec_time = diffTime(start, end);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
printf("algo_%d costs %.3fms \n", alg, dur.count() / ites);
if (dur.count() < exec_time) {
exec_time = dur.count();
fast_algo = alg;
}
}
......
......@@ -24,9 +24,11 @@
#include <map>
#include <stdio.h>
#include <stdlib.h>
#ifdef __linux__
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#endif
#include <time.h>
#include <vector>
namespace turbomind {
......
......@@ -17,6 +17,7 @@
#include "encoder_gemm_func.h"
#include <assert.h>
#include <sys/types.h>
#include <vector>
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
......@@ -268,17 +269,17 @@ int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
// given algo
const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
// Let try a fixed number of combinations
int AlgoCount = 0;
int AlgoCountRestrict = 0; // workspace == 0
const int maxNumTraversal = 50; // max number of traversal
cublasLtMatmulAlgo_t algos[AlgoCombinations]; // 0 <= workspace <= 32MB
cublasLtMatmulAlgo_t algosRestrict[AlgoCombinations]; // workspace == 0
const int kernelRepeats = 100; // number of time the CUDA kernels will be run back to back
int nbAlgoIds = 0; // Number of algorithms actually returned by
// cublasLtMatmulAlgoGetIds function.
#define ALGO_IDS 100 // Number of algorithms requested.
int algoIdA[ALGO_IDS]; // Array containing the algorithm IDs returned by
// cublasLtMatmulAlgoGetIds function.
int AlgoCount = 0;
int AlgoCountRestrict = 0; // workspace == 0
const int maxNumTraversal = 50; // max number of traversal
std::vector<cublasLtMatmulAlgo_t> algos(AlgoCombinations); // 0 <= workspace <= 32MB
std::vector<cublasLtMatmulAlgo_t> algosRestrict(AlgoCombinations); // workspace == 0
const int kernelRepeats = 100; // number of time the CUDA kernels will be run back to back
int nbAlgoIds = 0; // Number of algorithms actually returned by
// cublasLtMatmulAlgoGetIds function.
#define ALGO_IDS 100 // Number of algorithms requested.
int algoIdA[ALGO_IDS]; // Array containing the algorithm IDs returned by
// cublasLtMatmulAlgoGetIds function.
cudaDataType_t Atype, Btype, Ctype, scaleType, Dtype;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType;
......
......@@ -28,10 +28,12 @@
#ifdef ENABLE_FP8
#include <cuda_fp8.h>
#endif
#include <cuda_profiler_api.h>
#include <map>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <cuda_profiler_api.h>
#include <map>
#include <vector>
namespace turbomind {
......
......@@ -15,6 +15,8 @@
*/
#include "src/turbomind/utils/gemm_test/gpt_gemm_func.h"
#include "src/turbomind/macro.h"
#include <chrono>
namespace turbomind {
......@@ -232,7 +234,6 @@ void generate_gpt_gemm_config(int batch_size,
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
......@@ -332,7 +333,7 @@ void generate_gpt_gemm_config(int batch_size,
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
if (i == 1) {
status = cublasGemmStridedBatchedEx(cublas_handle,
......@@ -432,11 +433,12 @@ void generate_gpt_gemm_config(int batch_size,
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
if (dur.count() / ites < exec_time) {
exec_time = dur.count() / ites;
fast_algo = algo;
}
}
......@@ -450,7 +452,7 @@ void generate_gpt_gemm_config(int batch_size,
if ((data_type != FLOAT_DATATYPE && i != 1 && i != 2 && i != 10) || data_type == FP8_DATATYPE) {
printf("***cublasLt Gemm Testing Beign***\n");
// Let try a fixed number of combinations
int ALGO_COMBINATIONS = 10000;
const int ALGO_COMBINATIONS = 10000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
// for gpt, computeType & scaleType should be FP32
......@@ -644,7 +646,7 @@ void generate_gpt_gemm_config(int batch_size,
CHECK_CUSPARSE(
cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
// initializing MatDesc takes a lot of time
// and these descs can be stored to other place
......@@ -675,10 +677,11 @@ void generate_gpt_gemm_config(int batch_size,
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
if (diffTime(start, end) < exec_time) {
exec_time = diffTime(start, end);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
printf("algo_%d costs %.3fms \n", alg, dur.count() / ites);
if (dur.count() < exec_time) {
exec_time = dur.count();
fast_algo = alg;
}
}
......
......@@ -32,8 +32,10 @@
#endif
#include <cuda_profiler_api.h>
#include <map>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
namespace turbomind {
......
......@@ -15,6 +15,8 @@
*/
#include "src/turbomind/utils/gemm_test/swin_gemm_func.h"
#include "src/turbomind/macro.h"
#include <chrono>
namespace turbomind {
......@@ -140,7 +142,6 @@ void generate_swin_gemm_config(
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
......@@ -211,7 +212,7 @@ void generate_swin_gemm_config(
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
if (i < 5) {
status = cublasGemmEx(cublas_handle,
......@@ -289,11 +290,12 @@ void generate_swin_gemm_config(
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
if (dur.count() / ites < exec_time) {
exec_time = dur.count() / ites;
fast_algo = algo;
}
}
......@@ -304,7 +306,7 @@ void generate_swin_gemm_config(
if (i < 5 && data_type != FLOAT_DATATYPE) {
printf("***cublasLt Gemm Testing Begin***\n");
// Let try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
......
......@@ -27,8 +27,10 @@
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
namespace turbomind {
......
......@@ -15,6 +15,7 @@
*/
#include "swin_igemm_func.h"
#include <chrono>
namespace turbomind {
......@@ -86,10 +87,9 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult);
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
if (heurResult.workspaceSize <= workSpaceSizeInBytes) {
struct timeval start, end;
cublasStatus_t oneRunStatus;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int loop = 0; loop < repeats; loop++) {
oneRunStatus = cublasLtMatmul(ltHandle,
operationDesc,
......@@ -109,11 +109,12 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
stream);
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (oneRunStatus != CUBLAS_STATUS_SUCCESS) {
algoStatus = oneRunStatus;
}
float time = diffTime(start, end);
float time = dur.count();
// For the moment only add successful findings
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
perfResults.algo = algo;
......
......@@ -25,9 +25,11 @@
#include <map>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
namespace turbomind {
......
......@@ -15,6 +15,8 @@
*/
#include "src/turbomind/utils/gemm_test/t5_gemm_func.h"
#include "src/turbomind/macro.h"
#include <chrono>
namespace turbomind {
......@@ -202,7 +204,6 @@ void generate_t5_gemm_config(int batch_size,
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
......@@ -286,7 +287,7 @@ void generate_t5_gemm_config(int batch_size,
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
if (i == 0) {
status = cublasGemmBatchedEx(cublas_handle,
......@@ -408,11 +409,12 @@ void generate_t5_gemm_config(int batch_size,
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
if (dur.count() / ites < exec_time) {
exec_time = dur.count() / ites;
fast_algo = algo;
}
}
......@@ -431,7 +433,7 @@ void generate_t5_gemm_config(int batch_size,
if (data_type != FLOAT_DATATYPE && i != 1 && i != 2 && i != 0 && i != 10) {
printf("***cublasLt Gemm Testing Begin***\n");
// Let try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
// for t5, computeType & scaleType should be FP32
......@@ -643,7 +645,7 @@ void generate_t5_gemm_config(int batch_size,
CHECK_CUSPARSE(
cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
// initializing MatDesc takes a lot of time
// and these descs can be stored to other place
......@@ -674,10 +676,11 @@ void generate_t5_gemm_config(int batch_size,
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
if (diffTime(start, end) < exec_time) {
exec_time = diffTime(start, end);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
printf("algo_%d costs %.3fms \n", alg, dur.count() / ites);
if (dur.count() < exec_time) {
exec_time = dur.count();
fast_algo = alg;
}
}
......
......@@ -27,8 +27,10 @@
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
namespace turbomind {
......
......@@ -15,6 +15,8 @@
*/
#include "src/turbomind/utils/gemm_test/xlnet_gemm_func.h"
#include "src/turbomind/macro.h"
#include <chrono>
namespace turbomind {
......@@ -225,7 +227,6 @@ void generate_xlnet_gemm_config(int batch_size,
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
......@@ -285,7 +286,7 @@ void generate_xlnet_gemm_config(int batch_size,
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
auto start = std::chrono::high_resolution_clock::now();
for (int ite = 0; ite < ites; ++ite) {
if (i == 1 || i == 7 || i == 8 || i == 9) {
status = cublasGemmEx(cublas_handle,
......@@ -338,11 +339,12 @@ void generate_xlnet_gemm_config(int batch_size,
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
auto end = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration<float, std::milli>(end - start);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
printf("algo_%d costs %.3fms \n", algo, dur.count() / ites);
if (dur.count() / ites < exec_time) {
exec_time = dur.count() / ites;
fast_algo = algo;
} // end if diffTime
} // end status
......@@ -353,7 +355,7 @@ void generate_xlnet_gemm_config(int batch_size,
if ((i == 1 || i == 7 || i == 8 || i == 9) && data_type != FLOAT_DATATYPE) {
printf("***cublasLt Gemm Testing Begin***\n");
// Let try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
......
......@@ -27,8 +27,10 @@
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#ifdef __linux__
#include <sys/time.h>
#include <unistd.h>
#endif
#include <vector>
namespace turbomind {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment