Unverified commit 35d64462, authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
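
In short: FasterTransformer-era include paths move under src/turbomind, the fastertransformer namespace becomes turbomind, and the FT_-prefixed logging macros and environment variables become TM_-prefixed. A minimal before/after sketch of what callers see (the snippet is illustrative, not part of the commit):

    // before
    #include "src/fastertransformer/utils/logger.h"
    FT_LOG_INFO("hello");

    // after
    #include "src/turbomind/utils/logger.h"
    TM_LOG_INFO("hello");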
@@ -14,9 +14,9 @@
* limitations under the License.
*/
-#include "src/fastertransformer/utils/gemm_test/t5_gemm_func.h"
+#include "src/turbomind/utils/gemm_test/t5_gemm_func.h"
-namespace fastertransformer {
+namespace turbomind {
bool isSparseGemmAvailable(size_t m, size_t n, size_t k)
{
@@ -830,4 +830,4 @@ size_t calT5GemmTestBufSizeInByte(int batch_size,
return buf_size_in_byte;
}
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,10 +16,10 @@
#pragma once
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/gemm_test/gemm_func.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/gemm_test/gemm_func.h"
#include <cstdio>
#include <cstdlib>
@@ -31,7 +31,7 @@
#include <unistd.h>
#include <vector>
-namespace fastertransformer {
+namespace turbomind {
template<typename T>
void generate_t5_gemm_config(int batch_size,
@@ -66,4 +66,4 @@ size_t calT5GemmTestBufSizeInByte(int batch_size,
int tensor_para_size,
CublasDataType data_type);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,9 +14,9 @@
* limitations under the License.
*/
-#include "src/fastertransformer/utils/gemm_test/xlnet_gemm_func.h"
+#include "src/turbomind/utils/gemm_test/xlnet_gemm_func.h"
-namespace fastertransformer {
+namespace turbomind {
template<typename T>
void generate_xlnet_gemm_config(int batch_size,
@@ -458,4 +458,4 @@ template void generate_xlnet_gemm_config<__nv_bfloat16>(int batch_size,
bool isAppend);
#endif
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,10 +16,10 @@
#pragma once
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/gemm_test/gemm_func.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/gemm_test/gemm_func.h"
#include <cstdio>
#include <cstdlib>
@@ -31,7 +31,7 @@
#include <unistd.h>
#include <vector>
-namespace fastertransformer {
+namespace turbomind {
template<typename T>
void generate_xlnet_gemm_config(int batch_size,
@@ -43,4 +43,4 @@ void generate_xlnet_gemm_config(int batch_size,
void* buffer_in,
bool isAppend = true);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,15 +17,15 @@
#pragma once
#include "cuda_fp16.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
#include <cstdlib>
#include <stdexcept>
#include <type_traits>
#include <vector>
-namespace fastertransformer {
+namespace turbomind {
template<typename T>
class GPUBuf {
@@ -84,4 +84,4 @@ public:
T* ptr;
};
-} // namespace fastertransformer
+} // namespace turbomind
#pragma once
-namespace fastertransformer {
+namespace turbomind {
class AbstractInstanceComm {
public:
@@ -13,4 +13,4 @@ public:
virtual void* getSharedObject() = 0;
};
-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,21 +14,21 @@
* limitations under the License.
*/
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/logger.h"
#include <cuda_runtime.h>
-namespace fastertransformer {
+namespace turbomind {
Logger::Logger()
{
-char* is_first_rank_only_char = std::getenv("FT_LOG_FIRST_RANK_ONLY");
+char* is_first_rank_only_char = std::getenv("TM_LOG_FIRST_RANK_ONLY");
bool is_first_rank_only =
(is_first_rank_only_char != nullptr && std::string(is_first_rank_only_char) == "ON") ? true : false;
int device_id;
cudaGetDevice(&device_id);
-char* level_name = std::getenv("FT_LOG_LEVEL");
+char* level_name = std::getenv("TM_LOG_LEVEL");
if (level_name != nullptr) {
std::map<std::string, Level> name_to_level = {
{"TRACE", TRACE},
@@ -38,7 +38,7 @@ Logger::Logger()
{"ERROR", ERROR},
};
auto level = name_to_level.find(level_name);
-// If FT_LOG_FIRST_RANK_ONLY=ON, set LOG LEVEL of other device to ERROR
+// If TM_LOG_FIRST_RANK_ONLY=ON, set LOG LEVEL of other device to ERROR
if (is_first_rank_only && device_id != 0) {
level = name_to_level.find("ERROR");
}
@@ -47,7 +47,7 @@ Logger::Logger()
}
else {
fprintf(stderr,
-"[FT][WARNING] Invalid logger level FT_LOG_LEVEL=%s. "
+"[FT][WARNING] Invalid logger level TM_LOG_LEVEL=%s. "
"Ignore the environment variable and use a default "
"logging level.\n",
level_name);
@@ -56,4 +56,4 @@ Logger::Logger()
}
}
-} // namespace fastertransformer
+} // namespace turbomind
@@ -20,9 +20,9 @@
#include <map>
#include <string>
-#include "src/fastertransformer/utils/string_utils.h"
+#include "src/turbomind/utils/string_utils.h"
-namespace fastertransformer {
+namespace turbomind {
class Logger {
@@ -77,7 +77,7 @@ public:
}
private:
-const std::string PREFIX = "[FT]";
+const std::string PREFIX = "[TM]";
const std::map<const Level, const std::string> level_name_ = {
{TRACE, "TRACE"}, {DEBUG, "DEBUG"}, {INFO, "INFO"}, {WARNING, "WARNING"}, {ERROR, "ERROR"}};
@@ -106,16 +106,16 @@ private:
}
};
-#define FT_LOG(level, ...) \
+#define TM_LOG(level, ...) \
do { \
-if (fastertransformer::Logger::getLogger().getLevel() <= level) { \
-fastertransformer::Logger::getLogger().log(level, __VA_ARGS__); \
+if (turbomind::Logger::getLogger().getLevel() <= level) { \
+turbomind::Logger::getLogger().log(level, __VA_ARGS__); \
} \
} while (0)
-#define FT_LOG_TRACE(...) FT_LOG(fastertransformer::Logger::TRACE, __VA_ARGS__)
-#define FT_LOG_DEBUG(...) FT_LOG(fastertransformer::Logger::DEBUG, __VA_ARGS__)
-#define FT_LOG_INFO(...) FT_LOG(fastertransformer::Logger::INFO, __VA_ARGS__)
-#define FT_LOG_WARNING(...) FT_LOG(fastertransformer::Logger::WARNING, __VA_ARGS__)
-#define FT_LOG_ERROR(...) FT_LOG(fastertransformer::Logger::ERROR, __VA_ARGS__)
-} // namespace fastertransformer
+#define TM_LOG_TRACE(...) TM_LOG(turbomind::Logger::TRACE, __VA_ARGS__)
+#define TM_LOG_DEBUG(...) TM_LOG(turbomind::Logger::DEBUG, __VA_ARGS__)
+#define TM_LOG_INFO(...) TM_LOG(turbomind::Logger::INFO, __VA_ARGS__)
+#define TM_LOG_WARNING(...) TM_LOG(turbomind::Logger::WARNING, __VA_ARGS__)
+#define TM_LOG_ERROR(...) TM_LOG(turbomind::Logger::ERROR, __VA_ARGS__)
+} // namespace turbomind
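
Putting the renamed logger pieces together, a hedged usage sketch (the file and messages are hypothetical; the macros and the TM_LOG_LEVEL / TM_LOG_FIRST_RANK_ONLY environment variables are the ones defined above):

    // Run with e.g. TM_LOG_LEVEL=DEBUG. With TM_LOG_FIRST_RANK_ONLY=ON,
    // devices other than 0 are clamped to ERROR, as in Logger() above.
    #include "src/turbomind/utils/logger.h"

    int main() {
        TM_LOG_INFO("shown when the configured level is INFO or more verbose");
        TM_LOG_DEBUG("step %d", 1); // shown only for TM_LOG_LEVEL=DEBUG or TRACE
        TM_LOG_ERROR("shown at every level");
        return 0;
    }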
@@ -14,15 +14,15 @@
* limitations under the License.
*/
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
-#include "src/fastertransformer/utils/logger.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/cuda_type_utils.cuh"
+#include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/memory_utils.h"
#include <curand_kernel.h>
#include <sys/stat.h>
#include <unordered_map>
-namespace fastertransformer {
+namespace turbomind {
template<typename T>
void deviceMalloc(T** ptr, size_t size, bool is_random_initialize)
@@ -130,7 +130,7 @@ template<typename T>
void cudaH2Dcpy(T* tgt, const T* src, const size_t size)
{
if (tgt == nullptr || src == nullptr) {
-FT_LOG_ERROR("cudaH2Dcpy: dst=%p src=%p, size=%d", tgt, src, (int)(sizeof(T) * size));
+TM_LOG_ERROR("cudaH2Dcpy: dst=%p src=%p, size=%d", tgt, src, (int)(sizeof(T) * size));
}
check_cuda_error(cudaMemcpy(tgt, src, sizeof(T) * size, cudaMemcpyHostToDevice));
}
@@ -313,14 +313,14 @@ std::vector<T> loadWeightFromBinHelper(std::vector<size_t> shape, std::string fi
}
size_t size = dim0 * dim1;
if (size == 0) {
-FT_LOG_WARNING("shape is zero, skip loading weight from file %s \n", filename.c_str());
+TM_LOG_WARNING("shape is zero, skip loading weight from file %s \n", filename.c_str());
return std::vector<T>();
}
std::vector<T> host_array(size);
std::ifstream in(filename, std::ios::in | std::ios::binary);
if (!in.is_open()) {
-FT_LOG_WARNING("file %s cannot be opened, loading model fails! \n", filename.c_str());
+TM_LOG_WARNING("file %s cannot be opened, loading model fails! \n", filename.c_str());
return std::vector<T>();
}
@@ -328,12 +328,12 @@ std::vector<T> loadWeightFromBinHelper(std::vector<size_t> shape, std::string fi
in.seekg(0, in.end);
in.seekg(0, in.beg);
-FT_LOG_DEBUG("Read " + std::to_string(loaded_data_size) + " bytes from " + filename);
+TM_LOG_DEBUG("Read " + std::to_string(loaded_data_size) + " bytes from " + filename);
in.read((char*)host_array.data(), loaded_data_size);
size_t in_get_size = in.gcount();
if (in_get_size != loaded_data_size) {
-FT_LOG_WARNING("file %s only has %ld, but request %ld, loading model fails! \n",
+TM_LOG_WARNING("file %s only has %ld, but request %ld, loading model fails! \n",
filename.c_str(),
in_get_size,
loaded_data_size);
@@ -417,7 +417,7 @@ int loadWeightFromBin(T* ptr, std::vector<size_t> shape, std::string filename, F
break;
#endif
default:
-FT_LOG_ERROR("Does not support FtCudaDataType=%d", model_file_type);
+TM_LOG_ERROR("Does not support FtCudaDataType=%d", model_file_type);
FT_CHECK(false);
}
return 0;
@@ -839,4 +839,4 @@ bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within
template bool
invokeCheckRange<int>(int* buffer, const size_t size, int min, int max, bool* d_within_range, cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,11 +16,11 @@
#pragma once
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
-namespace fastertransformer {
+namespace turbomind {
template<typename T>
void deviceMalloc(T** ptr, size_t size, bool is_random_initialize = true);
@@ -144,4 +144,4 @@ size_t cuda_datatype_size(FtCudaDataType dt);
template<typename T>
bool invokeCheckRange(T* buffer, const size_t size, T min, T max, bool* d_within_range, cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
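
For orientation, a hedged sketch of the renamed memory helpers, using only calls visible in this diff (deviceMalloc, cudaH2Dcpy); the function name and sizes are arbitrary:

    #include "src/turbomind/utils/memory_utils.h"

    void upload_example() {
        float host[256] = {};
        float* dev = nullptr;
        // Allocate 256 floats without random initialization, then copy host -> device.
        // cudaH2Dcpy logs via TM_LOG_ERROR if either pointer is null.
        turbomind::deviceMalloc(&dev, 256, /*is_random_initialize=*/false);
        turbomind::cudaH2Dcpy(dev, host, 256);
    }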
@@ -14,9 +14,9 @@
* limitations under the License.
*/
-#include "src/fastertransformer/utils/mpi_utils.h"
+#include "src/turbomind/utils/mpi_utils.h"
-namespace fastertransformer {
+namespace turbomind {
namespace mpi {
#ifdef BUILD_MULTI_GPU
@@ -119,4 +119,4 @@ void bcast(void* buffer, size_t size, MpiType dtype, int root, MpiComm comm)
}
} // namespace mpi
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,7 +16,7 @@
#pragma once
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/logger.h"
#ifdef BUILD_MULTI_GPU
#include <mpi.h>
@@ -24,7 +24,7 @@
#include <stdio.h>
#include <unordered_map>
-namespace fastertransformer {
+namespace turbomind {
#ifdef BUILD_MULTI_GPU
#define MPICHECK(cmd) \
@@ -91,4 +91,4 @@ int getCommWorldSize();
void bcast(void* buffer, size_t size, MpiType dtype, int root, MpiComm comm);
} // namespace mpi
-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,16 +14,16 @@
* limitations under the License.
*/
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
#include <atomic>
-namespace fastertransformer {
+namespace turbomind {
#ifdef BUILD_MULTI_GPU
template<typename T>
ncclDataType_t getNcclDataType()
{
-FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
ncclDataType_t nccl_data_type;
if (std::is_same<T, float>::value) {
nccl_data_type = ncclFloat;
@@ -56,7 +56,7 @@ ncclDataType_t getNcclDataType()
template<typename T>
void ftNcclAllReduceSum(const T* send_buf, T* recv_buf, const int data_size, NcclParam nccl_param, cudaStream_t stream)
{
-FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
#ifdef BUILD_MULTI_GPU
ncclDataType_t nccl_data_type = getNcclDataType<T>();
NCCLCHECK(ncclGroupStart());
@@ -64,14 +64,14 @@ void ftNcclAllReduceSum(const T* send_buf, T* recv_buf, const int data_size, Ncc
(const void*)send_buf, (void*)recv_buf, data_size, nccl_data_type, ncclSum, nccl_param.nccl_comm_, stream));
NCCLCHECK(ncclGroupEnd());
#endif
-FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
}
template<typename T>
void ftNcclAllGather(
const T* send_buf, T* recv_buf, const int data_size, const int rank, NcclParam nccl_param, cudaStream_t stream)
{
-FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
#ifdef BUILD_MULTI_GPU
ncclDataType_t nccl_data_type = getNcclDataType<T>();
NCCLCHECK(ncclGroupStart());
@@ -79,18 +79,18 @@ void ftNcclAllGather(
ncclAllGather(send_buf + rank * data_size, recv_buf, data_size, nccl_data_type, nccl_param.nccl_comm_, stream));
NCCLCHECK(ncclGroupEnd());
#endif
-FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
}
template<typename T>
void ftNcclSend(const T* send_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream)
{
-FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
#ifdef BUILD_MULTI_GPU
ncclDataType_t nccl_data_type = getNcclDataType<T>();
NCCLCHECK(ncclSend(send_buf, data_size, nccl_data_type, peer, nccl_param.nccl_comm_, stream));
#endif
-FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
}
template void
@@ -111,12 +111,12 @@ ftNcclSend(const char* send_buf, const int data_size, const int peer, NcclParam
template<typename T>
void ftNcclRecv(T* recv_buf, const int data_size, const int peer, NcclParam nccl_param, cudaStream_t stream)
{
-FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
#ifdef BUILD_MULTI_GPU
ncclDataType_t nccl_data_type = getNcclDataType<T>();
NCCLCHECK(ncclRecv(recv_buf, data_size, nccl_data_type, peer, nccl_param.nccl_comm_, stream));
#endif
-FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
}
template void
@@ -136,12 +136,12 @@ ftNcclRecv(char* recv_buf, const int data_size, const int peer, NcclParam nccl_p
template<typename T>
void ftNcclBroadCast(T* buff, const int data_size, const int root, NcclParam nccl_param, cudaStream_t stream)
{
-FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
#ifdef BUILD_MULTI_GPU
ncclDataType_t nccl_data_type = getNcclDataType<T>();
NCCLCHECK(ncclBcast(buff, data_size, nccl_data_type, root, nccl_param.nccl_comm_, stream));
#endif
-FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
}
template void
@@ -215,7 +215,7 @@ void ftNcclGroupEnd()
void ftNcclStreamSynchronize(NcclParam tensor_para, NcclParam pipeline_para, cudaStream_t stream)
{
-FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
#ifdef BUILD_MULTI_GPU
cudaError_t cudaErr;
ncclResult_t tensor_ncclErr = ncclSuccess, tensor_ncclAsyncErr = ncclSuccess, pipeline_ncclErr = ncclSuccess,
@@ -229,7 +229,7 @@ void ftNcclStreamSynchronize(NcclParam tensor_para, NcclParam pipeline_para, cud
while (1) {
cudaErr = cudaStreamQuery(stream);
if (cudaErr == cudaSuccess) {
-FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
return;
}
@@ -282,11 +282,11 @@ void ftNcclGetUniqueId(NcclUid& uid)
void ftNcclCommInitRank(NcclParam& param, const int rank, const int world_size, const NcclUid uid)
{
-FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
#ifdef BUILD_MULTI_GPU
// Initialize a nccl communicator.
if (param.nccl_comm_ != nullptr) {
-FT_LOG_WARNING("NcclParam is already initialized.");
+TM_LOG_WARNING("NcclParam is already initialized.");
return;
}
param.rank_ = rank;
@@ -294,7 +294,7 @@ void ftNcclCommInitRank(NcclParam& param, const int rank, const int world_size,
param.nccl_uid_ = uid.nccl_uid_;
NCCLCHECK(ncclCommInitRank(&param.nccl_comm_, param.world_size_, param.nccl_uid_, param.rank_));
#endif
-FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
}
void ftNcclParamDestroy(NcclParam& param)
@@ -311,7 +311,7 @@ void ftNcclInitialize(NcclParam& tensor_para,
const int tensor_para_size,
const int pipeline_para_size)
{
-FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
// Initialize nccl communication grid of tensor and pipeline parallel groups.
#ifndef BUILD_MULTI_GPU
FT_CHECK_WITH_INFO(tensor_para_size == 1,
@@ -331,7 +331,7 @@ void ftNcclInitialize(NcclParam& tensor_para,
#else
// Initialize a nccl communicator.
if (tensor_para.nccl_comm_ != nullptr && pipeline_para.nccl_comm_ != nullptr) {
-FT_LOG_WARNING("NcclParam is already initialized. Skip NCCL initialization.");
+TM_LOG_WARNING("NcclParam is already initialized. Skip NCCL initialization.");
return;
}
FT_CHECK(tensor_para.nccl_comm_ == nullptr);
@@ -340,7 +340,7 @@ void ftNcclInitialize(NcclParam& tensor_para,
FT_CHECK(pipeline_para_size > 0);
if (tensor_para_size == 1 && pipeline_para_size == 1) {
-FT_LOG_WARNING("Skip NCCL initialization since requested tensor/pipeline parallel sizes are equals to 1.");
+TM_LOG_WARNING("Skip NCCL initialization since requested tensor/pipeline parallel sizes are equals to 1.");
tensor_para.rank_ = 0;
tensor_para.world_size_ = tensor_para_size;
pipeline_para.rank_ = 0;
@@ -384,19 +384,19 @@ void ftNcclInitialize(NcclParam& tensor_para,
ncclUniqueId pp_uid;
// The root of each group creates a nccl uid.
if (tp_rank == 0) {
-FT_LOG_DEBUG("rank %d pp rank %d creates nccl uid.", rank, tp_rank);
+TM_LOG_DEBUG("rank %d pp rank %d creates nccl uid.", rank, tp_rank);
NCCLCHECK(ncclGetUniqueId(&tp_uid));
}
if (pp_rank == 0) {
-FT_LOG_DEBUG("rank %d pp rank %d creates nccl uid.", rank, pp_rank);
+TM_LOG_DEBUG("rank %d pp rank %d creates nccl uid.", rank, pp_rank);
NCCLCHECK(ncclGetUniqueId(&pp_uid));
}
// Broadcast nccl uid to share the same nccl uid across gpus in the same group.
-FT_LOG_DEBUG("Broadcast nccl uid to the others in the same parallel groups.");
+TM_LOG_DEBUG("Broadcast nccl uid to the others in the same parallel groups.");
MPI_Bcast(&tp_uid, sizeof(tp_uid), MPI_BYTE, 0, tp_comm);
MPI_Bcast(&pp_uid, sizeof(pp_uid), MPI_BYTE, 0, pp_comm);
-FT_LOG_DEBUG("Initialize NCCL communicators.");
+TM_LOG_DEBUG("Initialize NCCL communicators.");
ncclComm_t tp_nccl_comm, pp_nccl_comm;
NCCLCHECK(ncclCommInitRank(&tp_nccl_comm, tensor_para_size, tp_uid, tp_rank));
NCCLCHECK(ncclCommInitRank(&pp_nccl_comm, pipeline_para_size, pp_uid, pp_rank));
@@ -409,13 +409,13 @@ void ftNcclInitialize(NcclParam& tensor_para,
pipeline_para.rank_ = pp_rank;
pipeline_para.nccl_uid_ = pp_uid;
pipeline_para.nccl_comm_ = pp_nccl_comm;
-FT_LOG_INFO("NCCL initialized rank=%d world_size=%d tensor_para=%s pipeline_para=%s",
+TM_LOG_INFO("NCCL initialized rank=%d world_size=%d tensor_para=%s pipeline_para=%s",
rank,
world_size,
tensor_para.toString().c_str(),
pipeline_para.toString().c_str());
#endif
-FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
+TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
}
static std::atomic<int>& ncclGroupCount()
@@ -449,4 +449,4 @@ size_t getLocalBatchSize(const size_t batch_size, const size_t seq_len, const si
return local_batch_size;
}
-} // namespace fastertransformer
+} // namespace turbomind
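
A hedged sketch of calling one of the renamed NCCL wrappers; the NcclParam is assumed to have been initialized elsewhere (e.g. via ftNcclInitialize above) and the buffers are placeholders:

    #include "src/turbomind/utils/nccl_utils.h"

    void allreduce_example(const float* d_send, float* d_recv, int n,
                           turbomind::NcclParam tp, cudaStream_t stream) {
        // Sums n floats across the tensor-parallel group; the wrapper emits the
        // TM_LOG_DEBUG start/stop traces shown in the implementation above.
        turbomind::ftNcclAllReduceSum(d_send, d_recv, n, tp, stream);
    }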
@@ -16,9 +16,9 @@
#pragma once
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/logger.h"
-#include "src/fastertransformer/utils/mpi_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/mpi_utils.h"
#include <cuda_runtime.h>
#ifdef BUILD_MULTI_GPU
@@ -32,7 +32,7 @@
#define ENABLE_BF16_NCCL
#endif
-namespace fastertransformer {
+namespace turbomind {
#ifdef BUILD_MULTI_GPU
#define NCCLCHECK(cmd) \
do { \
@@ -128,4 +128,4 @@ int ftNcclGroupCount();
size_t getLocalBatchSize(const size_t batch_size, const size_t seq_len, const size_t pipeline_para_size);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,7 +17,7 @@
#pragma once
-namespace fastertransformer {
+namespace turbomind {
enum class PromptLearningType {
no_prompt,
@@ -26,4 +26,4 @@ enum class PromptLearningType {
p_prompt_tuning
};
-} // namespace fastertransformer
+} // namespace turbomind
@@ -21,7 +21,7 @@
#include <string>
#include <vector>
-namespace fastertransformer {
+namespace turbomind {
template<typename... Args>
inline std::string fmtstr(const std::string& format, Args... args)
@@ -81,4 +81,4 @@ inline std::string arr2str(T* arr, size_t size)
ss << ")";
return ss.str();
}
-} // namespace fastertransformer
+} // namespace turbomind
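
A one-line hedged example of the fmtstr helper declared above (the values are arbitrary):

    #include "src/turbomind/utils/string_utils.h"

    std::string msg = turbomind::fmtstr("rank=%d world_size=%d", 0, 2);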
@@ -20,7 +20,7 @@
#include <cuda.h>
#include <cuda_runtime_api.h>
-namespace fastertransformer {
+namespace turbomind {
#define TIMEIT(print, n, stream, fn, ...) \
({ \
@@ -57,4 +57,4 @@ struct abs_diff {
}
};
-} // namespace fastertransformer
+} // namespace turbomind
@@ -23,7 +23,7 @@
#include <cuda_fp16.h>
#include <cudnn.h>
-namespace fastertransformer {
+namespace turbomind {
template<typename T>
void conv2d(T* output,
@@ -154,7 +154,7 @@ void conv2d(T* output,
output_descriptor_,
convolution_algorithm_,
&ws_size));
-FT_LOG_DEBUG("Convolution algorithm: %d with workspace size: %d \n", convolution_algorithm_, ws_size);
+TM_LOG_DEBUG("Convolution algorithm: %d with workspace size: %d \n", convolution_algorithm_, ws_size);
FT_CHECK_WITH_INFO(
ws_size <= (1 << 29),
"Current workspace used for CuDNN Convolution is fixed as 1 << 29, please increase it in WenetEncoder::allocateBuffer!");
@@ -195,4 +195,4 @@ void conv2d(T* output,
checkCUDNN(cudnnDestroyConvolutionDescriptor(convolution_descriptor_));
}
-} // namespace fastertransformer
+} // namespace turbomind
@@ -19,7 +19,7 @@
#include "assert.h"
-namespace fastertransformer {
+namespace turbomind {
int read_word_list(const std::string& filename, std::vector<int>& file_data)
{
@@ -47,4 +47,4 @@ int read_word_list(const std::string& filename, std::vector<int>& file_data)
return 0;
}
-} // namespace fastertransformer
+} // namespace turbomind
@@ -19,7 +19,7 @@
#include "Tensor.h"
#include "stdlib.h"
-namespace fastertransformer {
+namespace turbomind {
int read_word_list(const std::string& filename, std::vector<int>& tensor_data);
......