Unverified Commit 35d64462 authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name

parent 53d2e42c
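The change is mechanical: every `fastertransformer` namespace reference becomes `turbomind`, and the `FT_LOG_*` logging macros become `TM_LOG_*` (check macros such as `FT_CHECK_WITH_INFO` keep their names but now call `turbomind::myAssert`). As a hedged sketch only, and not part of this commit, code still written against the old names could be kept compiling with a namespace alias plus forwarding macros:

// Hypothetical compatibility shim, NOT part of commit 35d64462; the alias and
// forwarding macro below are illustrative assumptions.
#include "src/turbomind/utils/logger.h"

namespace fastertransformer = turbomind;  // old qualified names keep resolving

#ifndef FT_LOG_WARNING
#define FT_LOG_WARNING(...) TM_LOG_WARNING(__VA_ARGS__)  // forward old macro to the new one
#endif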
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 /* **************************** debug tools ********************************* */
@@ -57,7 +57,7 @@ template<typename T>
 void print_abs_mean(const T* buf, uint size, cudaStream_t stream, std::string name)
 {
     if (buf == nullptr) {
-        FT_LOG_WARNING("It is an nullptr, skip!");
+        TM_LOG_WARNING("It is an nullptr, skip!");
         return;
     }
     cudaDeviceSynchronize();
@@ -110,7 +110,7 @@ template<typename T>
 void print_to_screen(const T* result, const int size)
 {
     if (result == nullptr) {
-        FT_LOG_WARNING("It is an nullptr, skip! \n");
+        TM_LOG_WARNING("It is an nullptr, skip! \n");
         return;
     }
     T* tmp = reinterpret_cast<T*>(malloc(sizeof(T) * size));
@@ -366,7 +366,7 @@ FtCudaDataType getModelFileType(std::string ini_file, std::string section_name)
     FtCudaDataType model_file_type;
     INIReader reader = INIReader(ini_file);
     if (reader.ParseError() < 0) {
-        FT_LOG_WARNING("Can't load %s. Use FP32 as default", ini_file.c_str());
+        TM_LOG_WARNING("Can't load %s. Use FP32 as default", ini_file.c_str());
         model_file_type = FtCudaDataType::FP32;
     }
     else {
@@ -381,7 +381,7 @@ FtCudaDataType getModelFileType(std::string ini_file, std::string section_name)
             model_file_type = FtCudaDataType::BF16;
         }
         else {
-            FT_LOG_WARNING("Invalid type %s. Use FP32 as default", weight_data_type_str.c_str());
+            TM_LOG_WARNING("Invalid type %s. Use FP32 as default", weight_data_type_str.c_str());
             model_file_type = FtCudaDataType::FP32;
         }
     }
@@ -389,4 +389,4 @@ FtCudaDataType getModelFileType(std::string ini_file, std::string section_name)
 }
 /* ************************** end of common utils ************************** */
-} // namespace fastertransformer
+} // namespace turbomind
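For orientation, `getModelFileType` above parses a model's ini file with INIReader and falls back to FP32 whenever parsing fails or the `weight_data_type` value is unrecognized. A minimal illustrative call (the config path and section name are assumptions, not taken from the repo):

// Illustrative only: the path and section name are hypothetical.
FtCudaDataType model_file_type = getModelFileType("/models/llama/config.ini", "llama");
if (model_file_type == FtCudaDataType::FP32) {
    TM_LOG_INFO("weights will be loaded as FP32");
}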
@@ -17,8 +17,8 @@
 #pragma once
 #include "3rdparty/INIReader.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/logger.h"
 #include <cublasLt.h>
 #include <cublas_v2.h>
@@ -31,7 +31,7 @@
 #include <cusparseLt.h>
 #endif
-namespace fastertransformer {
+namespace turbomind {
 #define MAX_CONFIG_NUM 20
 #define COL32_ 32
@@ -137,7 +137,7 @@ inline void syncAndCheck(const char* const file, int const line)
         throw std::runtime_error(std::string("[FT][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result))
                                  + " " + file + ":" + std::to_string(line) + " \n");
     }
-    FT_LOG_DEBUG(fmtstr("run syncAndCheck at %s:%d", file, line));
+    TM_LOG_DEBUG(fmtstr("run syncAndCheck at %s:%d", file, line));
 }
 }
@@ -212,7 +212,7 @@ inline void myAssert(bool result, const char* const file, int const line, std::s
     do {                                                                           \
         bool is_valid_val = (val);                                                 \
         if (!is_valid_val) {                                                       \
-            fastertransformer::myAssert(is_valid_val, __FILE__, __LINE__, (info)); \
+            turbomind::myAssert(is_valid_val, __FILE__, __LINE__, (info));         \
         }                                                                          \
     } while (0)
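The `do { ... } while (0)` wrapper above is the standard idiom that makes a multi-statement macro behave as a single statement, so it stays safe inside an unbraced `if`/`else`. An illustrative call site for the `FT_CHECK_WITH_INFO`-style macro, with hypothetical names:

// Illustrative call site; d_buf is a hypothetical device pointer.
float* d_buf = nullptr;
cudaMalloc(&d_buf, 1024 * sizeof(float));
FT_CHECK_WITH_INFO(d_buf != nullptr, "cudaMalloc failed for d_buf");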
@@ -442,14 +442,14 @@ void compareTwoTensor(
     }
     if (print_size > 0) {
-        FT_LOG_INFO(" id | pred | ref |abs diff | rel diff (%) |");
+        TM_LOG_INFO(" id | pred | ref |abs diff | rel diff (%) |");
     }
     float mean_abs_diff = 0.0f;
     float mean_rel_diff = 0.0f;
     int count = 0;
     for (int i = 0; i < size; i++) {
         if (i < print_size) {
-            FT_LOG_INFO("%4d | % 6.4f | % 6.4f | % 6.4f | % 7.4f |",
+            TM_LOG_INFO("%4d | % 6.4f | % 6.4f | % 6.4f | % 7.4f |",
                         i,
                         (float)h_pred[i],
                         (float)h_ref[i],
@@ -474,7 +474,7 @@ void compareTwoTensor(
     }
     mean_abs_diff = mean_abs_diff / (float)count;
     mean_rel_diff = mean_rel_diff / (float)count;
-    FT_LOG_INFO("mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff);
+    TM_LOG_INFO("mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff);
     if (fd != nullptr) {
         fprintf(fd, "mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff);
@@ -485,4 +485,4 @@ void compareTwoTensor(
 }
 /* ************************** end of common utils ************************** */
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,7 +16,7 @@
 #include "custom_ar_comm.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 CustomAllReduceComm<T>::CustomAllReduceComm(size_t rank_size, size_t rank): rank_size_(rank_size), rank_(rank)
@@ -137,7 +137,7 @@ void initCustomAllReduceComm(std::vector<std::shared_ptr<AbstractCustomComm>>* c
     if (rank_size != RANKS_PER_NODE) {
 #ifdef BUILD_MULTI_GPU
         if (rank_size > 1) {
-            FT_LOG_WARNING("Custom All Reduce only supports 8 Ranks currently. Using NCCL as Comm.");
+            TM_LOG_WARNING("Custom All Reduce only supports 8 Ranks currently. Using NCCL as Comm.");
         }
 #else
         FT_CHECK_WITH_INFO(rank_size == 1,
@@ -158,7 +158,7 @@ void initCustomAllReduceComm(std::vector<std::shared_ptr<AbstractCustomComm>>* c
     }
     custom_all_reduce_comms->at(0)->allocateAndExchangePeerAccessPointer(custom_all_reduce_comms);
 #else
-    FT_LOG_WARNING("Custom All Reduce is not supported before CUDA 11.2. Using NCCL as Comm.");
+    TM_LOG_WARNING("Custom All Reduce is not supported before CUDA 11.2. Using NCCL as Comm.");
     for (size_t i = 0; i < rank_size; i++) {
         custom_all_reduce_comms->push_back(nullptr);
     }
@@ -186,4 +186,4 @@ initCustomAllReduceComm<uint32_t>(std::vector<std::shared_ptr<AbstractCustomComm
     int enable_custom_all_reduce,
     size_t rank_size);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -21,12 +21,12 @@
 #include <stdio.h>
 #include <stdlib.h>
-#include "src/fastertransformer/kernels/custom_ar_kernels.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/kernels/custom_ar_kernels.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/logger.h"
-namespace fastertransformer {
+namespace turbomind {
 class AbstractCustomComm {
 public:
@@ -84,4 +84,4 @@ struct CustomARCommTypeConverter<__nv_bfloat16> {
 };
 #endif
-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/utils/gemm.h"
+#include "src/turbomind/utils/gemm.h"
-namespace fastertransformer {
+namespace turbomind {
 /* ***************************** GEMM Impl ******************************** */
@@ -222,7 +222,7 @@ void Gemm::gemm(const GemmOp transa,
                 const float alpha,
                 const float beta)
 {
-    FT_LOG_TRACE("Gemm::gemm [m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", m, n, k, lda, ldb, ldc);
+    TM_LOG_TRACE("Gemm::gemm [m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", m, n, k, lda, ldb, ldc);
     // Implementation copied from cublasMMWrapper::Gemm
     // Switch A and B since both cublas and cublasLt assume a column major layout,
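The A/B switch referenced in the comment above works because a row-major matrix reinterpreted as column-major is its transpose: row-major C = A·B can be computed as column-major Cᵀ = Bᵀ·Aᵀ. A minimal standalone sketch of the trick with plain cuBLAS (not TurboMind's wrapper; the function and names below are assumptions for illustration):

// Sketch: row-major C[m x n] = A[m x k] * B[k x n] via column-major cuBLAS.
// Because row-major X looks like X^T to cuBLAS, we request C^T = B^T * A^T
// by swapping the operand order and the m/n dimensions.
#include <cublas_v2.h>

void gemm_row_major(cublasHandle_t handle,
                    int m, int n, int k,
                    const float* A,  // m x k, row-major
                    const float* B,  // k x n, row-major
                    float* C)        // m x n, row-major
{
    const float alpha = 1.0f, beta = 0.0f;
    // Note the swapped roles: B is passed where cuBLAS expects "A".
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                n, m, k,   // dimensions of C^T
                &alpha,
                B, n,      // leading dimension = row length of B
                A, k,
                &beta,
                C, n);
}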
@@ -423,7 +423,7 @@ void Gemm::batchedGemm(const GemmOp transa,
                        const float alpha,
                        const float beta)
 {
-    FT_LOG_TRACE(
+    TM_LOG_TRACE(
         "Gemm::batchedGemm [b=%ld m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", batch_size, m, n, k, lda, ldb, ldc);
     // Switch A and B.
@@ -624,7 +624,7 @@ void Gemm::stridedBatchedGemm(GemmOp transa,
                               const float alpha,
                               const float beta)
 {
-    FT_LOG_TRACE("Gemm::stridedBatchedGemm [b=%ld, m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]",
+    TM_LOG_TRACE("Gemm::stridedBatchedGemm [b=%ld, m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]",
                  batch_size,
                  m,
                  n,
@@ -873,7 +873,7 @@ void SpGemm::gemm(const GemmOp transa,
                   const float alpha,
                   const float beta)
 {
-    FT_LOG_TRACE("SpGemm::gemm [m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", m, n, k, lda, ldb, ldc);
+    TM_LOG_TRACE("SpGemm::gemm [m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", m, n, k, lda, ldb, ldc);
     checkDataTypeValidity(Atype);
     checkDataTypeValidity(Btype);
     checkDataTypeValidity(Ctype);
@@ -994,7 +994,7 @@ void SpGemm::gemm(const GemmOp transa,
 std::shared_ptr<Gemm> createGemm(IAllocator* allocator, cudaStream_t stream, bool sparse, bool quantized)
 {
-    FT_LOG_TRACE(
+    TM_LOG_TRACE(
         "Create Gemm instance [sparse=%s, quantized=%s]", sparse ? "true" : "false", quantized ? "true" : "false");
     std::shared_ptr<Gemm> gemm;
     if (!sparse) {
@@ -1105,7 +1105,7 @@ cusparseComputeType getCusparseComputeType(DataType ctype)
 void pruneMatrixB(void* data, const cudaStream_t& stream, const size_t k, const size_t n, const GemmOp trans)
 {
-    FT_LOG_TRACE("Prune matrix B [k=%ld, n=%ld, op=%s]", k, n, getGemmOpString(trans).c_str());
+    TM_LOG_TRACE("Prune matrix B [k=%ld, n=%ld, op=%s]", k, n, getGemmOpString(trans).c_str());
     // Due to A/B switching, the matrix B will be used as a matrix A.
     const cusparseOrder_t order = CUSPARSE_ORDER_COL;
@@ -1141,7 +1141,7 @@ size_t compressMatrixB(void** output,
                        const size_t n,
                        const GemmOp trans)
 {
-    FT_LOG_TRACE("compressMatrix [k=%ld, n=%ld, dtype=FP16]", k, n);
+    TM_LOG_TRACE("compressMatrix [k=%ld, n=%ld, dtype=FP16]", k, n);
     // swap A/B due to column/row major layout mismatch.
     cusparseOrder_t order = CUSPARSE_ORDER_COL;
@@ -1181,4 +1181,4 @@ size_t compressMatrixB(void** output,
 /* ************************* End of GEMM utils **************************** */
-} // end of namespace fastertransformer
+} // end of namespace turbomind
@@ -27,13 +27,13 @@
 // TODO: Need to remove the dependency of the layer module.
 // e.g. refactor Weight class to some base module.
-#include "src/fastertransformer/layers/DenseWeight.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/logger.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/layers/DenseWeight.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/memory_utils.h"
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
@@ -42,7 +42,7 @@
 // cublas default workspace size: 32MB. Let me make this as a Gemm property.
 #define WORKSPACE_SIZE 33554432
-namespace fastertransformer {
+namespace turbomind {
 // A wrapper of cublas or cusparse matrix operator.
 // - GEMM_OP_N = CUBLAS_OP_N or CUSPARSE_OP_N
@@ -677,4 +677,4 @@ size_t compressMatrixB(void** output,
 /* ************************* End of GEMM utils **************************** */
-} // end of namespace fastertransformer
+} // end of namespace turbomind
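One small sanity check on the `WORKSPACE_SIZE` constant in the hunk above: 33554432 bytes is exactly 32 * 1024 * 1024, so the "32MB" in the comment means 32 MiB. An illustrative compile-time guard (not present in the codebase) would be:

// Illustrative only: documents the arithmetic behind the magic number.
static_assert(WORKSPACE_SIZE == 32 * 1024 * 1024,
              "cuBLAS default workspace constant is 32 MiB (33554432 bytes)");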
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/utils/gemm_test/decoding_gemm_func.h"
+#include "src/turbomind/utils/gemm_test/decoding_gemm_func.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void generate_decoding_gemm_config(int batch_size,
@@ -401,4 +401,4 @@ size_t calDecodingGemmTestBufSizeInByte(int batch_size,
     return buf_size_in_byte;
 }
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,10 +16,10 @@
 #pragma once
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/gemm_test/gemm_func.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/gemm_test/gemm_func.h"
 #include <cstdio>
 #include <cstdlib>
@@ -31,7 +31,7 @@
 #include <unistd.h>
 #include <vector>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void generate_decoding_gemm_config(int batch_size,
@@ -55,4 +55,4 @@ size_t calDecodingGemmTestBufSizeInByte(int batch_size,
     int vocab_size,
     CublasDataType data_type);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/utils/gemm_test/encoder_gemm_func.h"
+#include "src/turbomind/utils/gemm_test/encoder_gemm_func.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void generate_encoder_gemm_config(
@@ -560,4 +560,4 @@ template void generate_encoder_gemm_config<__nv_bfloat16>(
     int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend, int tensor_para_size);
 #endif
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,10 +16,10 @@
 #pragma once
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/gemm_test/gemm_func.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/gemm_test/gemm_func.h"
 #include <cstdio>
 #include <cstdlib>
@@ -31,7 +31,7 @@
 #include <unistd.h>
 #include <vector>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void generate_encoder_gemm_config(int batch_size,
@@ -42,4 +42,4 @@ void generate_encoder_gemm_config(int batch_size,
     bool isAppend = true,
     int tensor_para_size = 1);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -20,7 +20,7 @@
 #error CUDART_VERSION Undefined!
 #endif
-namespace fastertransformer {
+namespace turbomind {
 int batch_size_;
 int seq_len_;
@@ -1329,4 +1329,4 @@ int generate_encoder_igemm_config(
     return 0;
 }
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,8 +16,8 @@
 #pragma once
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/cuda_utils.h"
 #include <algorithm>
 #include <cublasLt.h>
 #include <cuda_runtime.h>
@@ -29,7 +29,7 @@
 #include <unistd.h>
 #include <vector>
-namespace fastertransformer {
+namespace turbomind {
 /* CAUTION : must match cublasLtMatmulTile_t */
 const char* const matmulTileName[] = {"UNDEF", "8x8", "8x16", "16x8", "8x32", "16x16", "32x8",
@@ -77,4 +77,4 @@ int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
 void matInit(int rows, int cols, int8_t* p, int ld);
-} // namespace fastertransformer
+} // namespace turbomind
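The CAUTION comment in the hunk above requires the name table to stay in lockstep with cuBLASLt's `cublasLtMatmulTile_t` enum. A hedged sketch of a compile-time guard (not in the codebase; it assumes the table covers every enum value up to `CUBLASLT_MATMUL_TILE_END`):

// Illustrative guard only: fails the build if the table and enum drift apart.
static_assert(sizeof(matmulTileName) / sizeof(matmulTileName[0]) == CUBLASLT_MATMUL_TILE_END,
              "matmulTileName must have one entry per cublasLtMatmulTile_t value");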
@@ -22,7 +22,7 @@
 #error CUDART_VERSION Undefined!
 #endif
-namespace fastertransformer {
+namespace turbomind {
 // Utility function to print customMatmulPerf_t structure
 int printPerfStructure(int batch_size,
@@ -986,4 +986,4 @@ size_t calGemmTestBufSizeInByteXlnet(
     return max_size * size_per_ele;
 }
-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,8 +17,8 @@
 #pragma once
 #include "encoder_igemm_func.h"  // TODO(bhsueh) Remove this include
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/cuda_utils.h"
 #include <cstdio>
 #include <cstdlib>
 #include <ctime>
@@ -34,7 +34,7 @@
 #include <unistd.h>
 #include <vector>
-namespace fastertransformer {
+namespace turbomind {
 // Scale Type Converter
 // is_fp16_compute_type is only valid when T = half
@@ -98,4 +98,4 @@ int printPerfStructure(int batch_size,
     int hasPrint,
     int batch_count = 1);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/utils/gemm_test/gpt_gemm_func.h"
+#include "src/turbomind/utils/gemm_test/gpt_gemm_func.h"
-namespace fastertransformer {
+namespace turbomind {
 bool isSparseGemmAvailable(size_t m, size_t n, size_t k)
 {
@@ -804,4 +804,4 @@ size_t calGptGemmTestBufSizeInByte(int batch_size,
     return buf_size_in_byte;
 }
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,10 +16,10 @@
 #pragma once
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/gemm_test/gemm_func.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/gemm_test/gemm_func.h"
 #include <cstdio>
 #include <cstdlib>
@@ -36,7 +36,7 @@
 #include <unistd.h>
 #include <vector>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void generate_gpt_gemm_config(int batch_size,
@@ -60,4 +60,4 @@ size_t calGptGemmTestBufSizeInByte(int batch_size,
     int tensor_para_size,
     CublasDataType data_type);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/utils/gemm_test/swin_gemm_func.h"
+#include "src/turbomind/utils/gemm_test/swin_gemm_func.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void generate_swin_gemm_config(
@@ -398,4 +398,4 @@ template void generate_swin_gemm_config<__nv_bfloat16>(
     int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend);
 #endif
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,10 +16,10 @@
 #pragma once
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/gemm_test/gemm_func.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/gemm_test/gemm_func.h"
 #include <cstdio>
 #include <cstdlib>
@@ -31,10 +31,10 @@
 #include <unistd.h>
 #include <vector>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void generate_swin_gemm_config(
     int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,7 +16,7 @@
 #include "swin_igemm_func.h"
-namespace fastertransformer {
+namespace turbomind {
 static const char* showStatus(cublasStatus_t error)
 {
@@ -278,4 +278,4 @@ int generate_swin_igemm_config(
     return 0;
 }
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,9 +16,9 @@
 #pragma once
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/gemm_test/encoder_igemm_func.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/gemm_test/encoder_igemm_func.h"
 #include <algorithm>
 #include <cublasLt.h>
 #include <cuda_runtime.h>
@@ -30,7 +30,7 @@
 #include <unistd.h>
 #include <vector>
-namespace fastertransformer {
+namespace turbomind {
 /* CAUTION : must match cublasLtMatmulTile_t */
 // const char* const matmulTileName[] = {
@@ -42,4 +42,4 @@ namespace fastertransformer {
 int generate_swin_igemm_config(
     int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true);
-} // namespace fastertransformer
+} // namespace turbomind