Unverified Commit 35d64462 authored by lvhan028's avatar lvhan028 Committed by GitHub
Browse files

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
......@@ -16,10 +16,10 @@
#pragma once
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
#include "src/turbomind/utils/Tensor.h"
namespace ft = fastertransformer;
namespace ft = turbomind;
template<typename T>
void move_tensor_H2D(const triton::Tensor& tensor,
......
......@@ -16,7 +16,7 @@
#pragma once
namespace fastertransformer {
namespace turbomind {
enum IA3_config {
KEY_ADAPTER = 1 << 0,
......@@ -43,4 +43,4 @@ static inline IA3_config& operator|=(IA3_config& x, IA3_config y)
return x = static_cast<IA3_config>(static_cast<int>(x) | static_cast<int>(y));
}
} // namespace fastertransformer
} // namespace turbomind
......@@ -17,7 +17,7 @@
#pragma once
#include "stdlib.h"
namespace fastertransformer {
namespace turbomind {
#define ACTIVATION_AMAX_NUM 72
#define INT8O_GEMM_NUM 8
......@@ -48,4 +48,4 @@ struct ScaleList {
size_t p4_offset_ = ACTIVATION_AMAX_NUM + 9 * 768 + INT8O_GEMM_NUM;
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -14,10 +14,10 @@
* limitations under the License.
*/
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/string_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/string_utils.h"
#include "stdlib.h"
#include <cuda_fp16.h>
......@@ -31,7 +31,7 @@
#include <unordered_map>
#include <vector>
namespace fastertransformer {
namespace turbomind {
Tensor::Tensor():
// a none tensor.
......@@ -271,7 +271,7 @@ std::string Tensor::getNumpyTypeDesc(DataType type) const
{TYPE_FP64, "f8"}};
if (type == TYPE_BF16) {
FT_LOG_WARNING("getNumpyTypeDesc(TYPE_BF16) returns an invalid type 'x' since Numpy doesn't "
TM_LOG_WARNING("getNumpyTypeDesc(TYPE_BF16) returns an invalid type 'x' since Numpy doesn't "
"support bfloat16 as of now, it will be properly extended if numpy supports. "
"Please refer for the discussions https://github.com/numpy/numpy/issues/19808.");
}
......@@ -352,7 +352,7 @@ TensorMap::TensorMap(const std::unordered_map<std::string, Tensor>& tensor_map)
insert(kv.first, kv.second);
}
else {
FT_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", kv.first.c_str()));
TM_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", kv.first.c_str()));
}
}
}
......@@ -371,7 +371,7 @@ TensorMap::TensorMap(std::initializer_list<std::pair<std::string, Tensor>> tenso
insert(pair.first, pair.second);
}
else {
FT_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", pair.first.c_str()));
TM_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", pair.first.c_str()));
}
}
}
......@@ -456,4 +456,4 @@ void TensorMap::saveNpy(const std::string& base_folder)
}
}
} // namespace fastertransformer
} // namespace turbomind
......@@ -16,10 +16,10 @@
#pragma once
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/string_utils.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/string_utils.h"
#include "stdlib.h"
#include <cuda_fp16.h>
......@@ -33,7 +33,7 @@
#include <unordered_map>
#include <vector>
namespace fastertransformer {
namespace turbomind {
typedef enum datatype_enum {
TYPE_INVALID,
......@@ -135,13 +135,13 @@ struct Tensor {
template<typename T>
inline T getVal(size_t index) const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
FT_CHECK(where == MEMORY_CPU);
FT_CHECK(data != nullptr);
FT_CHECK_WITH_INFO(index < size(), "index is larger than buffer size");
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
......@@ -151,9 +151,9 @@ struct Tensor {
template<typename T>
inline T getVal() const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
......@@ -163,9 +163,9 @@ struct Tensor {
template<typename T>
inline T* getPtr() const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getPtr with type %s, but data type is: %s",
TM_LOG_DEBUG("getPtr with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
......@@ -174,7 +174,7 @@ struct Tensor {
inline void* getPtrWithOffset(size_t offset) const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
if (data == nullptr) {
return (void*)data;
}
......@@ -187,9 +187,9 @@ struct Tensor {
template<typename T>
inline T* getPtrWithOffset(size_t offset) const
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
......@@ -207,7 +207,7 @@ struct Tensor {
T max() const
{
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
......@@ -230,7 +230,7 @@ struct Tensor {
T min() const
{
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
......@@ -253,7 +253,7 @@ struct Tensor {
T any(T val) const
{
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
......@@ -272,7 +272,7 @@ struct Tensor {
T all(T val) const
{
if (getTensorType<T>() != type) {
FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
getNumpyTypeDesc(getTensorType<T>()).c_str(),
getNumpyTypeDesc(type).c_str());
}
......@@ -324,7 +324,7 @@ public:
inline bool isExist(const std::string& key) const
{
FT_LOG_DEBUG("%s for key: %s", __PRETTY_FUNCTION__, key.c_str());
TM_LOG_DEBUG("%s for key: %s", __PRETTY_FUNCTION__, key.c_str());
return tensor_map_.find(key) != tensor_map_.end();
}
......@@ -355,7 +355,7 @@ public:
inline Tensor& at(const std::string& key)
{
FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
FT_CHECK_WITH_INFO(isExist(key),
fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
key.c_str(),
......@@ -374,7 +374,7 @@ public:
inline Tensor& at(const std::string& key, Tensor& default_tensor)
{
FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
if (isExist(key)) {
return tensor_map_.at(key);
}
......@@ -383,7 +383,7 @@ public:
inline Tensor at(const std::string& key, Tensor& default_tensor) const
{
FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
if (isExist(key)) {
return tensor_map_.at(key);
}
......@@ -392,7 +392,7 @@ public:
inline Tensor& at(const std::string& key, Tensor&& default_tensor)
{
FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
if (isExist(key)) {
return tensor_map_.at(key);
}
......@@ -518,4 +518,4 @@ public:
void saveNpy(const std::string& base_folder);
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -16,9 +16,9 @@
#pragma once
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace fastertransformer {
namespace turbomind {
enum class ActivationType {
Gelu,
......@@ -63,4 +63,4 @@ inline bool isGatedActivation(ActivationType activaiton_type)
|| activaiton_type == ActivationType::SiGLU;
}
} // namespace fastertransformer
} // namespace turbomind
......@@ -41,13 +41,13 @@
#include <memory>
#endif
#include "src/fastertransformer/utils/logger.h"
#include "src/turbomind/utils/logger.h"
#if defined(CUDART_VERSION) && CUDART_VERSION < 11020
#define CUDA_MEMORY_POOL_DISABLED
#endif
namespace fastertransformer {
namespace turbomind {
enum class AllocatorType {
CUDA,
......@@ -74,26 +74,26 @@ public:
template<typename T>
void* reMalloc(T* ptr, size_t size, const bool is_set_zero = true, bool is_host = false)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
size = ((size + 31) / 32) * 32; // make the buffer align with 32 bytes
void* void_ptr = (void*)ptr;
void* ptr_address = getAddress(void_ptr);
if (isExist(ptr_address)) {
ReallocType realloc_type = isReMalloc(ptr_address, size);
if (realloc_type == ReallocType::INCREASE) {
FT_LOG_DEBUG("ReMalloc the buffer %p since it is too small.", void_ptr);
TM_LOG_DEBUG("ReMalloc the buffer %p since it is too small.", void_ptr);
free((void**)(&void_ptr), is_host);
return malloc(size, is_set_zero, is_host);
}
#if !defined(CUDA_MEMORY_POOL_DISABLED)
else if (realloc_type == ReallocType::DECREASE) {
FT_LOG_DEBUG("ReMalloc the buffer %p to release unused memory to memory pools.", void_ptr);
TM_LOG_DEBUG("ReMalloc the buffer %p to release unused memory to memory pools.", void_ptr);
free((void**)(&void_ptr), is_host);
return malloc(size, is_set_zero, is_host);
}
#endif
else {
FT_LOG_DEBUG("Reuse original buffer %p with size %d and do nothing for reMalloc.", void_ptr, size);
TM_LOG_DEBUG("Reuse original buffer %p with size %d and do nothing for reMalloc.", void_ptr, size);
if (is_set_zero) {
memSet(void_ptr, 0, size);
}
......@@ -101,7 +101,7 @@ public:
}
}
else {
FT_LOG_DEBUG("Cannot find buffer %p, mallocing new one.", void_ptr);
TM_LOG_DEBUG("Cannot find buffer %p, mallocing new one.", void_ptr);
return malloc(size, is_set_zero, is_host);
}
}
......@@ -147,10 +147,10 @@ private:
public:
Allocator(int device_id): device_id_(device_id)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
pointer_mapping_ = new std::unordered_map<void*, size_t>();
#if defined(CUDA_MEMORY_POOL_DISABLED)
FT_LOG_WARNING(
TM_LOG_WARNING(
"Async cudaMalloc/Free is not supported before CUDA 11.2. Using Sync cudaMalloc/Free."
"Note this may lead to hang with NCCL kernels launched in parallel; if so, try NCCL_LAUNCH_MODE=GROUP");
#else
......@@ -166,7 +166,7 @@ public:
}
check_cuda_error(cudaDeviceCanAccessPeer(&peer_access_available, device_id, i));
if (!peer_access_available) {
FT_LOG_WARNING("Device " + std::to_string(device_id) + " peer access Device " + std::to_string(i)
TM_LOG_WARNING("Device " + std::to_string(device_id) + " peer access Device " + std::to_string(i)
+ " is not available.");
continue;
}
......@@ -183,7 +183,7 @@ public:
virtual ~Allocator()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
while (!pointer_mapping_->empty()) {
free((void**)(&pointer_mapping_->begin()->first));
}
......@@ -202,7 +202,7 @@ public:
void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (size == 0) {
return nullptr;
}
......@@ -224,7 +224,7 @@ public:
check_cuda_error(cudaMemsetAsync(ptr, 0, (size_t)(ceil(size / 32.)) * 32, stream_));
}
check_cuda_error(getSetDevice(o_device));
FT_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size);
TM_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size);
pointer_mapping_->insert({getAddress(ptr), size});
......@@ -233,12 +233,12 @@ public:
void free(void** ptr, bool is_host = false) const
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
void* address = getAddress(*ptr);
if (*ptr != nullptr) {
int o_device = 0;
if (pointer_mapping_->count(address)) {
FT_LOG_DEBUG("Free buffer %p", address);
TM_LOG_DEBUG("Free buffer %p", address);
check_cuda_error(getSetDevice(device_id_, &o_device));
if (is_host) {
check_cuda_error(cudaFreeHost(*ptr));
......@@ -255,7 +255,7 @@ public:
pointer_mapping_->erase(address);
}
else {
FT_LOG_WARNING("pointer_mapping_ does not have information of ptr at %p.", address);
TM_LOG_WARNING("pointer_mapping_ does not have information of ptr at %p.", address);
}
}
*ptr = nullptr;
......@@ -287,7 +287,7 @@ class Allocator<AllocatorType::TF>: public IAllocator {
for (int i = 0; i < pointer_mapping_->at(address).dims(); i++) {
current_buffer_size *= pointer_mapping_->at(address).dim_size(i);
}
FT_LOG_DEBUG("current_buffer_size: %d, new buffer: %d", current_buffer_size, size);
TM_LOG_DEBUG("current_buffer_size: %d, new buffer: %d", current_buffer_size, size);
if (current_buffer_size < size) {
return ReallocType::INCREASE;
}
......@@ -317,7 +317,7 @@ public:
void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
tensorflow::Tensor buf;
long long int buf_size = ((long long int)ceil(size / 32.) * 32);
tensorflow::Status status;
......@@ -347,7 +347,7 @@ public:
void free(void** ptr, bool is_host = false) const
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
void* address = getAddress(*ptr);
pointer_mapping_->erase(address);
*ptr = nullptr;
......@@ -387,7 +387,7 @@ class Allocator<AllocatorType::TH>: public IAllocator {
for (int i = 0; i < pointer_mapping_->at(address).dim(); i++) {
current_buffer_size *= pointer_mapping_->at(address).size(i);
}
FT_LOG_DEBUG(
TM_LOG_DEBUG(
"current_buffer_size: %d, original buffer: %p, new buffer: %d", current_buffer_size, address, size);
if (current_buffer_size < size) {
return ReallocType::INCREASE;
......@@ -419,7 +419,7 @@ public:
void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
int64_t buf_size = static_cast<int64_t>(ceil(size / 32.)) * 32;
torch::Tensor buf;
if (is_host) {
......@@ -432,14 +432,14 @@ public:
if (is_set_zero) {
cudaMemset(ptr, 0, buf_size);
}
FT_LOG_DEBUG("malloc buffer %p with size %ld", ptr, buf_size);
TM_LOG_DEBUG("malloc buffer %p with size %ld", ptr, buf_size);
pointer_mapping_->insert({getAddress(ptr), buf});
return ptr;
}
void free(void** ptr, bool is_host = false) const
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
void* address = getAddress(*ptr);
pointer_mapping_->erase(address);
*ptr = nullptr;
......@@ -448,7 +448,7 @@ public:
virtual ~Allocator()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
while (!pointer_mapping_->empty()) {
void* ptr = pointer_mapping_->begin()->second.data_ptr();
free((void**)(&ptr));
......@@ -463,4 +463,4 @@ public:
}
};
#endif
} // namespace fastertransformer
} // namespace turbomind
......@@ -23,7 +23,7 @@
#include <cuda_fp16.h>
#include <cudnn.h>
namespace fastertransformer {
namespace turbomind {
template<typename T>
void conv2d(T* output,
......@@ -134,4 +134,4 @@ void conv2d(T* output,
checkCUDNN(cudnnDestroyConvolutionDescriptor(convolution_descriptor_));
}
} // namespace fastertransformer
} // namespace turbomind
......@@ -16,7 +16,7 @@
#include "cublasAlgoMap.h"
namespace fastertransformer {
namespace turbomind {
cublasAlgoMap::cublasAlgoMap(const std::string filename, const std::string sp_config_filename):
config_filename_(filename), sp_config_filename_(sp_config_filename)
......@@ -223,4 +223,4 @@ bool cublasAlgoMap::isUseSparse(const int batch_count, const int m, const int n,
}
}
} // namespace fastertransformer
} // namespace turbomind
......@@ -14,7 +14,7 @@
* limitations under the License.
*/
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
......@@ -24,7 +24,7 @@
#include <utility>
#pragma once
namespace fastertransformer {
namespace turbomind {
#define GEMM_NUM 6
#define GEMM_CONFIG "gemm_config.in"
......@@ -102,4 +102,4 @@ public:
getAlgo(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type);
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -17,7 +17,7 @@
#include "cublasFP8MMWrapper.h"
#include "cuda_utils.h"
namespace fastertransformer {
namespace turbomind {
#define CUBLAS_WORKSPACE_1MB 1048576
cublasFP8MMWrapper::cublasFP8MMWrapper(cublasLtHandle_t cublaslt_handle,
......@@ -27,7 +27,7 @@ cublasFP8MMWrapper::cublasFP8MMWrapper(cublasLtHandle_t cublaslt_handle,
IAllocator* allocator):
cublasMMWrapper(nullptr, cublaslt_handle, stream, cublas_algo_map, mu, allocator)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK_WITH_INFO(allocator != nullptr, "must pass allocator to cublasFP8MMWrapper");
cublasVersionCheck();
......@@ -44,7 +44,7 @@ cublasFP8MMWrapper::cublasFP8MMWrapper(cublasHandle_t cublas_handle,
IAllocator* allocator):
cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, mu, allocator)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK_WITH_INFO(allocator != nullptr, "must pass allocator to cublasFP8MMWrapper");
cublasVersionCheck();
if (allocator_ != nullptr) {
......@@ -54,7 +54,7 @@ cublasFP8MMWrapper::cublasFP8MMWrapper(cublasHandle_t cublas_handle,
cublasFP8MMWrapper::~cublasFP8MMWrapper()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_ = nullptr;
if (allocator_ != nullptr) {
allocator_->free((void**)(&cublas_workspace_qgemm_));
......@@ -69,7 +69,7 @@ cublasFP8MMWrapper::cublasFP8MMWrapper(const cublasFP8MMWrapper& wrapper):
wrapper.mu_,
wrapper.allocator_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
cublasVersionCheck();
}
......@@ -135,7 +135,7 @@ void cublasFP8MMWrapper::Gemm(__nv_bfloat16* res,
cudaStream_t stream,
bool fastAccum)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
const void* devAscalePtr = (const void*)kernel_scale;
......@@ -345,7 +345,7 @@ void cublasFP8MMWrapper::Gemm(__nv_fp8_e4m3* res,
cudaStream_t stream,
bool fastAccum)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
const void* devAscalePtr = (const void*)kernel_scale;
......@@ -534,7 +534,7 @@ void cublasFP8MMWrapper::Conv1x1Gemm(__nv_fp8_e4m3* res,
const float output_scale,
cudaStream_t stream)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
size_t workspace_size = 0;
// get workspace size
......@@ -615,7 +615,7 @@ void cublasFP8MMWrapper::Gemm_Bias_Act(__nv_bfloat16* res,
const float* output_scale,
cudaStream_t stream)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
const void* devAscalePtr = (const void*)kernel_scale;
......@@ -777,7 +777,7 @@ void cublasFP8MMWrapper::Gemm_Bias_Act(__nv_fp8_e4m3* res,
const float* output_scale,
cudaStream_t stream)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
const void* devAscalePtr = (const void*)kernel_scale;
......@@ -1018,4 +1018,4 @@ template void cublasFP8MMWrapper::Gemm_Bias_Act<false, false>(__nv_fp8_e4m3*
const float* output_scale,
cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
......@@ -16,9 +16,9 @@
#include "3rdparty/fp8_qgmma_1x1/fp8_qgmma_1x1_utils.h"
#include "cuda_utils.h"
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/turbomind/utils/cublasAlgoMap.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
......@@ -28,7 +28,7 @@
#pragma once
namespace fastertransformer {
namespace turbomind {
class cublasFP8MMWrapper: public cublasMMWrapper {
public:
......@@ -170,8 +170,8 @@ public:
private:
int version_major_, version_minor_, version_patch_;
fastertransformer::qgmma1x1Launcher qgmmaLauncher;
turbomind::qgmma1x1Launcher qgmmaLauncher;
void* cublas_workspace_qgemm_ = nullptr;
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -20,7 +20,7 @@
#error CUDART_VERSION Undefined!
#endif
namespace fastertransformer {
namespace turbomind {
cublasINT8MMWrapper::cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
cublasAlgoMap* cublas_algo_map,
......@@ -556,4 +556,4 @@ void cublasINT8MMWrapper::SpGemm(
mu_->unlock();
}
#endif
} // namespace fastertransformer
} // namespace turbomind
......@@ -15,9 +15,9 @@
*/
#include "cuda_utils.h"
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
#include "src/turbomind/utils/cublasAlgoMap.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
......@@ -26,7 +26,7 @@
#include <string>
#pragma once
namespace fastertransformer {
namespace turbomind {
class cublasINT8MMWrapper: public cublasMMWrapper {
private:
......@@ -91,4 +91,4 @@ public:
#endif
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -21,7 +21,7 @@
#error CUDART_VERSION Undefined!
#endif
namespace fastertransformer {
namespace turbomind {
cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
cublasLtHandle_t cublaslt_handle,
cudaStream_t stream,
......@@ -35,7 +35,7 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
mu_(mu),
allocator_(allocator)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
}
......@@ -57,7 +57,7 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
mu_(mu),
allocator_(allocator)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
}
......@@ -66,7 +66,7 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
cublasMMWrapper::~cublasMMWrapper()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_ = nullptr;
if (allocator_ != nullptr) {
allocator_->free((void**)(&cublas_workspace_));
......@@ -85,7 +85,7 @@ cublasMMWrapper::cublasMMWrapper(const cublasMMWrapper& wrapper):
mu_(wrapper.mu_),
allocator_(wrapper.allocator_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (allocator_ != nullptr) {
cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
}
......@@ -110,7 +110,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
cudaDataType_t computeType,
cublasGemmAlgo_t algo)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
mu_->lock();
check_cuda_error(cublasGemmEx(cublas_handle_,
transa,
......@@ -147,7 +147,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
void* C,
const int ldc)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f);
}
......@@ -165,7 +165,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
float f_alpha,
float f_beta)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
half h_alpha = (half)(f_alpha);
half h_beta = (half)(f_beta);
......@@ -396,7 +396,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
void* C,
const int ldc)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
cudaDataType_t Atype, Btype, Ctype;
cublasComputeType_t computeType;
cudaDataType_t scaleType;
......@@ -1099,4 +1099,4 @@ void cublasMMWrapper::Int8Gemm(const int m,
return _Int8Gemm(m, n, k, A, lda, B, ldb, C, ldc, (float*)nullptr, 1, false);
}
} // namespace fastertransformer
} // namespace turbomind
......@@ -15,8 +15,8 @@
*/
#include "cuda_utils.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasAlgoMap.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
......@@ -25,7 +25,7 @@
#include <string>
#pragma once
namespace fastertransformer {
namespace turbomind {
class cublasMMWrapper {
protected:
......@@ -293,4 +293,4 @@ public:
#endif
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -16,10 +16,10 @@
#pragma once
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include <cuda_fp16.h>
namespace fastertransformer {
namespace turbomind {
#ifdef ENABLE_BF16
inline __device__ float2 bf1622float2(const __nv_bfloat162 val)
......@@ -287,4 +287,4 @@ inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, _
#endif // ENABLE_BF16
} // namespace fastertransformer
} // namespace turbomind
......@@ -16,7 +16,7 @@
#include "cuda_fp8_utils.h"
namespace fastertransformer {
namespace turbomind {
#ifdef ENABLE_FP8
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
......@@ -121,4 +121,4 @@ template void
invokeComputeFP8QuantizeScale(float* quant_ptr, const float* weights, const int k, const int n, cudaStream_t stream);
#endif // ENABLE_FP8
} // namespace fastertransformer
} // namespace turbomind
......@@ -31,7 +31,7 @@
#define USE_QGMMA
#endif
namespace fastertransformer {
namespace turbomind {
const float FP8_E4M3_MAX = 480.0f;
......@@ -190,5 +190,5 @@ void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int size, cudaStream_
template<typename T_W>
void invokeComputeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n, cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
#endif // ENABLE_FP8
......@@ -16,13 +16,13 @@
#pragma once
#include "src/fastertransformer/utils/cuda_bf16_fallbacks.cuh"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/turbomind/utils/cuda_bf16_fallbacks.cuh"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#include <cuda.h>
#include <cuda_fp16.h>
namespace fastertransformer {
namespace turbomind {
template<typename T>
inline __device__ T ldg(const T* val)
......@@ -598,4 +598,4 @@ __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val)
#endif // ENABLE_FP8
} // namespace fastertransformer
} // namespace turbomind
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment