Unverified Commit 35d64462 authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
@@ -16,10 +16,10 @@
 #pragma once
 
-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/utils/Tensor.h"
 
-namespace ft = fastertransformer;
+namespace ft = turbomind;
 
 template<typename T>
 void move_tensor_H2D(const triton::Tensor& tensor,
...
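Worth noting from the hunk above: downstream code addresses the library through the short alias `ft`, so retargeting the alias is essentially the whole migration for those call sites. A minimal sketch (the stand-in `Tensor` type is for illustration only):

```cpp
// Sketch of why the rename is source-compatible for alias users: only the
// alias target changes; call sites written against "ft::" stay untouched.
namespace turbomind {
struct Tensor {};  // stand-in for the real type
}
namespace ft = turbomind;  // was: namespace ft = fastertransformer;

ft::Tensor t;  // compiles before and after the rename
```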
@@ -16,7 +16,7 @@
 #pragma once
 
-namespace fastertransformer {
+namespace turbomind {
 
 enum IA3_config {
     KEY_ADAPTER = 1 << 0,
@@ -43,4 +43,4 @@ static inline IA3_config& operator|=(IA3_config& x, IA3_config y)
     return x = static_cast<IA3_config>(static_cast<int>(x) | static_cast<int>(y));
 }
 
-}  // namespace fastertransformer
+}  // namespace turbomind
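The hunk above shows the bit-flag pattern behind `IA3_config`: enumerators are distinct powers of two, and `operator|=` is defined by hand because unscoped enums do not compose under bitwise operators on their own. A self-contained sketch, with `VALUE_ADAPTER` as a hypothetical second flag added here purely for illustration:

```cpp
#include <cstdio>

enum IA3_config {
    KEY_ADAPTER   = 1 << 0,
    VALUE_ADAPTER = 1 << 1,  // hypothetical flag, for illustration only
};

// Same definition as in the hunk: round-trip through int, then back.
static inline IA3_config& operator|=(IA3_config& x, IA3_config y)
{
    return x = static_cast<IA3_config>(static_cast<int>(x) | static_cast<int>(y));
}

int main()
{
    IA3_config cfg = KEY_ADAPTER;
    cfg |= VALUE_ADAPTER;                                    // combine flags
    std::printf("key set: %d\n", (cfg & KEY_ADAPTER) != 0);  // prints 1
}
```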
@@ -17,7 +17,7 @@
 #pragma once
 #include "stdlib.h"
 
-namespace fastertransformer {
+namespace turbomind {
 
 #define ACTIVATION_AMAX_NUM 72
 #define INT8O_GEMM_NUM 8
@@ -48,4 +48,4 @@ struct ScaleList {
     size_t p4_offset_ = ACTIVATION_AMAX_NUM + 9 * 768 + INT8O_GEMM_NUM;
 };
 
-}  // namespace fastertransformer
+}  // namespace turbomind
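The `p4_offset_` default above is plain constant arithmetic; a compile-time check makes the resulting element count explicit (constants mirrored from the hunk):

```cpp
#include <cstddef>

constexpr std::size_t ACTIVATION_AMAX_NUM_C = 72;  // mirrors ACTIVATION_AMAX_NUM
constexpr std::size_t INT8O_GEMM_NUM_C      = 8;   // mirrors INT8O_GEMM_NUM

// 72 + 9 * 768 + 8 = 6992 elements into the scale buffer.
static_assert(ACTIVATION_AMAX_NUM_C + 9 * 768 + INT8O_GEMM_NUM_C == 6992,
              "p4_offset_ layout changed");
```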
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/string_utils.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/string_utils.h"
 
 #include "stdlib.h"
 #include <cuda_fp16.h>
@@ -31,7 +31,7 @@
 #include <unordered_map>
 #include <vector>
 
-namespace fastertransformer {
+namespace turbomind {
 
 Tensor::Tensor():
     // a none tensor.
@@ -271,7 +271,7 @@ std::string Tensor::getNumpyTypeDesc(DataType type) const
                                                        {TYPE_FP64, "f8"}};
 
     if (type == TYPE_BF16) {
-        FT_LOG_WARNING("getNumpyTypeDesc(TYPE_BF16) returns an invalid type 'x' since Numpy doesn't "
+        TM_LOG_WARNING("getNumpyTypeDesc(TYPE_BF16) returns an invalid type 'x' since Numpy doesn't "
                        "support bfloat16 as of now, it will be properly extended if numpy supports. "
                        "Please refer for the discussions https://github.com/numpy/numpy/issues/19808.");
     }
@@ -352,7 +352,7 @@ TensorMap::TensorMap(const std::unordered_map<std::string, Tensor>& tensor_map)
             insert(kv.first, kv.second);
         }
         else {
-            FT_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", kv.first.c_str()));
+            TM_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", kv.first.c_str()));
         }
     }
 }
@@ -371,7 +371,7 @@ TensorMap::TensorMap(std::initializer_list<std::pair<std::string, Tensor>> tensor_map)
             insert(pair.first, pair.second);
         }
         else {
-            FT_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", pair.first.c_str()));
+            TM_LOG_DEBUG(fmtstr("%s is not a valid tensor, skipping insert into TensorMap", pair.first.c_str()));
         }
     }
 }
@@ -456,4 +456,4 @@ void TensorMap::saveNpy(const std::string& base_folder)
     }
 }
 
-}  // namespace fastertransformer
+}  // namespace turbomind
@@ -16,10 +16,10 @@
 #pragma once
 
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/string_utils.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/string_utils.h"
 
 #include "stdlib.h"
 #include <cuda_fp16.h>
@@ -33,7 +33,7 @@
 #include <unordered_map>
 #include <vector>
 
-namespace fastertransformer {
+namespace turbomind {
 
 typedef enum datatype_enum {
     TYPE_INVALID,
@@ -135,13 +135,13 @@ struct Tensor {
     template<typename T>
     inline T getVal(size_t index) const
     {
-        FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+        TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
         FT_CHECK(where == MEMORY_CPU);
        FT_CHECK(data != nullptr);
         FT_CHECK_WITH_INFO(index < size(), "index is larger than buffer size");
 
         if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
                          getNumpyTypeDesc(getTensorType<T>()).c_str(),
                          getNumpyTypeDesc(type).c_str());
         }
@@ -151,9 +151,9 @@ struct Tensor {
     template<typename T>
     inline T getVal() const
     {
-        FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+        TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
         if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
                          getNumpyTypeDesc(getTensorType<T>()).c_str(),
                          getNumpyTypeDesc(type).c_str());
         }
@@ -163,9 +163,9 @@ struct Tensor {
     template<typename T>
     inline T* getPtr() const
     {
-        FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+        TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
         if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getPtr with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getPtr with type %s, but data type is: %s",
                          getNumpyTypeDesc(getTensorType<T>()).c_str(),
                          getNumpyTypeDesc(type).c_str());
         }
@@ -174,7 +174,7 @@ struct Tensor {
     inline void* getPtrWithOffset(size_t offset) const
     {
-        FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+        TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
         if (data == nullptr) {
             return (void*)data;
         }
@@ -187,9 +187,9 @@ struct Tensor {
     template<typename T>
     inline T* getPtrWithOffset(size_t offset) const
     {
-        FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+        TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
         if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
                          getNumpyTypeDesc(getTensorType<T>()).c_str(),
                          getNumpyTypeDesc(type).c_str());
         }
@@ -207,7 +207,7 @@ struct Tensor {
     T max() const
     {
         if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
                          getNumpyTypeDesc(getTensorType<T>()).c_str(),
                          getNumpyTypeDesc(type).c_str());
         }
@@ -230,7 +230,7 @@ struct Tensor {
     T min() const
     {
         if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
                          getNumpyTypeDesc(getTensorType<T>()).c_str(),
                          getNumpyTypeDesc(type).c_str());
         }
@@ -253,7 +253,7 @@ struct Tensor {
     T any(T val) const
     {
         if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
                          getNumpyTypeDesc(getTensorType<T>()).c_str(),
                          getNumpyTypeDesc(type).c_str());
         }
@@ -272,7 +272,7 @@ struct Tensor {
     T all(T val) const
     {
         if (getTensorType<T>() != type) {
-            FT_LOG_DEBUG("getVal with type %s, but data type is: %s",
+            TM_LOG_DEBUG("getVal with type %s, but data type is: %s",
                          getNumpyTypeDesc(getTensorType<T>()).c_str(),
                          getNumpyTypeDesc(type).c_str());
         }
@@ -324,7 +324,7 @@ public:
     inline bool isExist(const std::string& key) const
     {
-        FT_LOG_DEBUG("%s for key: %s", __PRETTY_FUNCTION__, key.c_str());
+        TM_LOG_DEBUG("%s for key: %s", __PRETTY_FUNCTION__, key.c_str());
         return tensor_map_.find(key) != tensor_map_.end();
     }
@@ -355,7 +355,7 @@ public:
     inline Tensor& at(const std::string& key)
     {
-        FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
+        TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
         FT_CHECK_WITH_INFO(isExist(key),
                            fmtstr("Cannot find a tensor of name %s in the tensor map (keys: %s)",
                                   key.c_str(),
@@ -374,7 +374,7 @@ public:
     inline Tensor& at(const std::string& key, Tensor& default_tensor)
     {
-        FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
+        TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
         if (isExist(key)) {
             return tensor_map_.at(key);
         }
@@ -383,7 +383,7 @@ public:
     inline Tensor at(const std::string& key, Tensor& default_tensor) const
     {
-        FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
+        TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
         if (isExist(key)) {
             return tensor_map_.at(key);
         }
@@ -392,7 +392,7 @@ public:
     inline Tensor& at(const std::string& key, Tensor&& default_tensor)
     {
-        FT_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
+        TM_LOG_DEBUG("%s for key %s", __PRETTY_FUNCTION__, key.c_str());
         if (isExist(key)) {
             return tensor_map_.at(key);
         }
@@ -518,4 +518,4 @@ public:
     void saveNpy(const std::string& base_folder);
 };
 
-}  // namespace fastertransformer
+}  // namespace turbomind
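Note on the accessors above: a template/element type mismatch in `getVal`/`getPtr` is only logged at debug level, while the hard `FT_CHECK`s guard memory location, null data, and bounds. A hedged usage sketch; the `{where, type, shape, data}` construction is assumed from the surrounding API and is not part of this diff:

```cpp
// Assumes the turbomind Tensor header is included; the constructor shape
// shown here is an assumption, not something this hunk confirms.
float readThird(float (&host_buf)[4])
{
    turbomind::Tensor t{turbomind::MEMORY_CPU, turbomind::TYPE_FP32, {4}, host_buf};
    // Types match: passes the FT_CHECKs and returns host_buf[2].
    // t.getVal<int>(2) would first emit the TM_LOG_DEBUG type-mismatch
    // message shown above; the mismatch is logged, not rejected.
    return t.getVal<float>(2);
}
```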
@@ -16,9 +16,9 @@
 #pragma once
 
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
 
-namespace fastertransformer {
+namespace turbomind {
 
 enum class ActivationType {
     Gelu,
@@ -63,4 +63,4 @@ inline bool isGatedActivation(ActivationType activaiton_type)
            || activaiton_type == ActivationType::SiGLU;
 }
 
-}  // namespace fastertransformer
+}  // namespace turbomind
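A quick illustration of the predicate above, assuming the header is included and that the enum also carries the gated SiGLU variant the final clause implies:

```cpp
// Sketch only: gated variants report true, plain activations false.
bool plain = turbomind::isGatedActivation(turbomind::ActivationType::Gelu);   // false
bool gated = turbomind::isGatedActivation(turbomind::ActivationType::SiGLU);  // true
```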
@@ -41,13 +41,13 @@
 #include <memory>
 #endif
 
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/logger.h"
 
 #if defined(CUDART_VERSION) && CUDART_VERSION < 11020
 #define CUDA_MEMORY_POOL_DISABLED
 #endif
 
-namespace fastertransformer {
+namespace turbomind {
 
 enum class AllocatorType {
     CUDA,
@@ -74,26 +74,26 @@ public:
     template<typename T>
     void* reMalloc(T* ptr, size_t size, const bool is_set_zero = true, bool is_host = false)
     {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
         size              = ((size + 31) / 32) * 32;  // make the buffer align with 32 bytes
         void* void_ptr    = (void*)ptr;
         void* ptr_address = getAddress(void_ptr);
         if (isExist(ptr_address)) {
             ReallocType realloc_type = isReMalloc(ptr_address, size);
             if (realloc_type == ReallocType::INCREASE) {
-                FT_LOG_DEBUG("ReMalloc the buffer %p since it is too small.", void_ptr);
+                TM_LOG_DEBUG("ReMalloc the buffer %p since it is too small.", void_ptr);
                 free((void**)(&void_ptr), is_host);
                 return malloc(size, is_set_zero, is_host);
             }
 #if !defined(CUDA_MEMORY_POOL_DISABLED)
             else if (realloc_type == ReallocType::DECREASE) {
-                FT_LOG_DEBUG("ReMalloc the buffer %p to release unused memory to memory pools.", void_ptr);
+                TM_LOG_DEBUG("ReMalloc the buffer %p to release unused memory to memory pools.", void_ptr);
                 free((void**)(&void_ptr), is_host);
                 return malloc(size, is_set_zero, is_host);
             }
 #endif
             else {
-                FT_LOG_DEBUG("Reuse original buffer %p with size %d and do nothing for reMalloc.", void_ptr, size);
+                TM_LOG_DEBUG("Reuse original buffer %p with size %d and do nothing for reMalloc.", void_ptr, size);
                 if (is_set_zero) {
                     memSet(void_ptr, 0, size);
                 }
@@ -101,7 +101,7 @@ public:
             }
         }
         else {
-            FT_LOG_DEBUG("Cannot find buffer %p, mallocing new one.", void_ptr);
+            TM_LOG_DEBUG("Cannot find buffer %p, mallocing new one.", void_ptr);
             return malloc(size, is_set_zero, is_host);
         }
     }
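The first line of `reMalloc` above rounds the requested size up to the next multiple of 32 bytes (the comment says "align", but it is the size, not the address, that gets rounded). The same expression, checked at compile time:

```cpp
#include <cstddef>

// Same expression as in reMalloc: round size up to a multiple of 32.
constexpr std::size_t roundUpTo32(std::size_t size)
{
    return ((size + 31) / 32) * 32;
}

static_assert(roundUpTo32(1)   == 32,  "");
static_assert(roundUpTo32(32)  == 32,  "");
static_assert(roundUpTo32(100) == 128, "");
```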
@@ -147,10 +147,10 @@ private:
 public:
     Allocator(int device_id): device_id_(device_id)
     {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
         pointer_mapping_ = new std::unordered_map<void*, size_t>();
 #if defined(CUDA_MEMORY_POOL_DISABLED)
-        FT_LOG_WARNING(
+        TM_LOG_WARNING(
             "Async cudaMalloc/Free is not supported before CUDA 11.2. Using Sync cudaMalloc/Free."
             "Note this may lead to hang with NCCL kernels launched in parallel; if so, try NCCL_LAUNCH_MODE=GROUP");
 #else
@@ -166,7 +166,7 @@ public:
             }
             check_cuda_error(cudaDeviceCanAccessPeer(&peer_access_available, device_id, i));
             if (!peer_access_available) {
-                FT_LOG_WARNING("Device " + std::to_string(device_id) + " peer access Device " + std::to_string(i)
+                TM_LOG_WARNING("Device " + std::to_string(device_id) + " peer access Device " + std::to_string(i)
                                + " is not available.");
                 continue;
             }
@@ -183,7 +183,7 @@ public:
     virtual ~Allocator()
     {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
         while (!pointer_mapping_->empty()) {
             free((void**)(&pointer_mapping_->begin()->first));
         }
@@ -202,7 +202,7 @@ public:
     void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
     {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
         if (size == 0) {
             return nullptr;
         }
@@ -224,7 +224,7 @@ public:
             check_cuda_error(cudaMemsetAsync(ptr, 0, (size_t)(ceil(size / 32.)) * 32, stream_));
         }
         check_cuda_error(getSetDevice(o_device));
-        FT_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size);
+        TM_LOG_DEBUG("malloc buffer %p with size %ld", ptr, size);
 
         pointer_mapping_->insert({getAddress(ptr), size});
@@ -233,12 +233,12 @@ public:
     void free(void** ptr, bool is_host = false) const
     {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
         void* address = getAddress(*ptr);
         if (*ptr != nullptr) {
             int o_device = 0;
             if (pointer_mapping_->count(address)) {
-                FT_LOG_DEBUG("Free buffer %p", address);
+                TM_LOG_DEBUG("Free buffer %p", address);
                 check_cuda_error(getSetDevice(device_id_, &o_device));
                 if (is_host) {
                     check_cuda_error(cudaFreeHost(*ptr));
@@ -255,7 +255,7 @@ public:
                 pointer_mapping_->erase(address);
             }
             else {
-                FT_LOG_WARNING("pointer_mapping_ does not have information of ptr at %p.", address);
+                TM_LOG_WARNING("pointer_mapping_ does not have information of ptr at %p.", address);
             }
         }
         *ptr = nullptr;
@@ -287,7 +287,7 @@ class Allocator<AllocatorType::TF>: public IAllocator {
         for (int i = 0; i < pointer_mapping_->at(address).dims(); i++) {
             current_buffer_size *= pointer_mapping_->at(address).dim_size(i);
         }
-        FT_LOG_DEBUG("current_buffer_size: %d, new buffer: %d", current_buffer_size, size);
+        TM_LOG_DEBUG("current_buffer_size: %d, new buffer: %d", current_buffer_size, size);
         if (current_buffer_size < size) {
             return ReallocType::INCREASE;
         }
@@ -317,7 +317,7 @@ public:
     void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
     {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
         tensorflow::Tensor buf;
         long long int      buf_size = ((long long int)ceil(size / 32.) * 32);
         tensorflow::Status status;
@@ -347,7 +347,7 @@ public:
     void free(void** ptr, bool is_host = false) const
     {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
         void* address = getAddress(*ptr);
         pointer_mapping_->erase(address);
         *ptr = nullptr;
@@ -387,7 +387,7 @@ class Allocator<AllocatorType::TH>: public IAllocator {
         for (int i = 0; i < pointer_mapping_->at(address).dim(); i++) {
             current_buffer_size *= pointer_mapping_->at(address).size(i);
         }
-        FT_LOG_DEBUG(
+        TM_LOG_DEBUG(
             "current_buffer_size: %d, original buffer: %p, new buffer: %d", current_buffer_size, address, size);
         if (current_buffer_size < size) {
             return ReallocType::INCREASE;
@@ -419,7 +419,7 @@ public:
     void* malloc(size_t size, const bool is_set_zero = true, bool is_host = false)
     {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
         int64_t       buf_size = static_cast<int64_t>(ceil(size / 32.)) * 32;
         torch::Tensor buf;
         if (is_host) {
@@ -432,14 +432,14 @@ public:
         if (is_set_zero) {
             cudaMemset(ptr, 0, buf_size);
         }
-        FT_LOG_DEBUG("malloc buffer %p with size %ld", ptr, buf_size);
+        TM_LOG_DEBUG("malloc buffer %p with size %ld", ptr, buf_size);
         pointer_mapping_->insert({getAddress(ptr), buf});
         return ptr;
     }
 
     void free(void** ptr, bool is_host = false) const
     {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
         void* address = getAddress(*ptr);
         pointer_mapping_->erase(address);
         *ptr = nullptr;
@@ -448,7 +448,7 @@ public:
     virtual ~Allocator()
     {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
         while (!pointer_mapping_->empty()) {
             void* ptr = pointer_mapping_->begin()->second.data_ptr();
             free((void**)(&ptr));
@@ -463,4 +463,4 @@ public:
     }
 };
 #endif
 
-}  // namespace fastertransformer
+}  // namespace turbomind
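Putting the CUDA allocator pieces above together, a hedged usage sketch; device 0 is assumed, and passing a null pointer on the first call relies on the "Cannot find buffer" fallthrough shown earlier:

```cpp
// Assumes the turbomind allocator header is included.
void allocatorDemo()
{
    turbomind::Allocator<turbomind::AllocatorType::CUDA> allocator(/*device_id=*/0);

    // First call: nothing tracked yet, so this is a fresh, zeroed allocation.
    float* buf = (float*)allocator.reMalloc((float*)nullptr, 1024 * sizeof(float));

    // Growing the request frees the old block and allocates a larger one.
    buf = (float*)allocator.reMalloc(buf, 4096 * sizeof(float));

    allocator.free((void**)&buf);  // also resets buf to nullptr
}
```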
@@ -23,7 +23,7 @@
 #include <cuda_fp16.h>
 #include <cudnn.h>
 
-namespace fastertransformer {
+namespace turbomind {
 
 template<typename T>
 void conv2d(T* output,
@@ -134,4 +134,4 @@ void conv2d(T* output,
     checkCUDNN(cudnnDestroyConvolutionDescriptor(convolution_descriptor_));
 }
 
-}  // namespace fastertransformer
+}  // namespace turbomind
@@ -16,7 +16,7 @@
 #include "cublasAlgoMap.h"
 
-namespace fastertransformer {
+namespace turbomind {
 
 cublasAlgoMap::cublasAlgoMap(const std::string filename, const std::string sp_config_filename):
     config_filename_(filename), sp_config_filename_(sp_config_filename)
@@ -223,4 +223,4 @@ bool cublasAlgoMap::isUseSparse(const int batch_count, const int m, const int n,
     }
 }
 
-}  // namespace fastertransformer
+}  // namespace turbomind
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
 #include <cublasLt.h>
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
@@ -24,7 +24,7 @@
 #include <utility>
 
 #pragma once
 
-namespace fastertransformer {
+namespace turbomind {
 
 #define GEMM_NUM 6
 #define GEMM_CONFIG "gemm_config.in"
@@ -102,4 +102,4 @@ public:
     getAlgo(const int batch_count, const int m, const int n, const int k, const CublasDataType data_type);
 };
 
-}  // namespace fastertransformer
+}  // namespace turbomind
@@ -17,7 +17,7 @@
 #include "cublasFP8MMWrapper.h"
 #include "cuda_utils.h"
 
-namespace fastertransformer {
+namespace turbomind {
 
 #define CUBLAS_WORKSPACE_1MB 1048576
 
 cublasFP8MMWrapper::cublasFP8MMWrapper(cublasLtHandle_t cublaslt_handle,
@@ -27,7 +27,7 @@ cublasFP8MMWrapper::cublasFP8MMWrapper(cublasLtHandle_t cublaslt_handle,
                                        IAllocator* allocator):
     cublasMMWrapper(nullptr, cublaslt_handle, stream, cublas_algo_map, mu, allocator)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     FT_CHECK_WITH_INFO(allocator != nullptr, "must pass allocator to cublasFP8MMWrapper");
     cublasVersionCheck();
@@ -44,7 +44,7 @@ cublasFP8MMWrapper::cublasFP8MMWrapper(cublasHandle_t cublas_handle,
                                        IAllocator* allocator):
     cublasMMWrapper(cublas_handle, cublaslt_handle, stream, cublas_algo_map, mu, allocator)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     FT_CHECK_WITH_INFO(allocator != nullptr, "must pass allocator to cublasFP8MMWrapper");
     cublasVersionCheck();
     if (allocator_ != nullptr) {
@@ -54,7 +54,7 @@ cublasFP8MMWrapper::cublasFP8MMWrapper(cublasHandle_t cublas_handle,
 cublasFP8MMWrapper::~cublasFP8MMWrapper()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     mu_ = nullptr;
     if (allocator_ != nullptr) {
         allocator_->free((void**)(&cublas_workspace_qgemm_));
@@ -69,7 +69,7 @@ cublasFP8MMWrapper::cublasFP8MMWrapper(const cublasFP8MMWrapper& wrapper):
                     wrapper.mu_,
                     wrapper.allocator_)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     cublasVersionCheck();
 }
@@ -135,7 +135,7 @@ void cublasFP8MMWrapper::Gemm(__nv_bfloat16* res,
                               cudaStream_t stream,
                               bool fastAccum)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     mu_->lock();
 
     const void* devAscalePtr = (const void*)kernel_scale;
@@ -345,7 +345,7 @@ void cublasFP8MMWrapper::Gemm(__nv_fp8_e4m3* res,
                               cudaStream_t stream,
                               bool fastAccum)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     mu_->lock();
 
     const void* devAscalePtr = (const void*)kernel_scale;
@@ -534,7 +534,7 @@ void cublasFP8MMWrapper::Conv1x1Gemm(__nv_fp8_e4m3* res,
                                      const float output_scale,
                                      cudaStream_t stream)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     mu_->lock();
     size_t workspace_size = 0;
     // get workspace size
@@ -615,7 +615,7 @@ void cublasFP8MMWrapper::Gemm_Bias_Act(__nv_bfloat16* res,
                                        const float* output_scale,
                                        cudaStream_t stream)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     mu_->lock();
 
     const void* devAscalePtr = (const void*)kernel_scale;
@@ -777,7 +777,7 @@ void cublasFP8MMWrapper::Gemm_Bias_Act(__nv_fp8_e4m3* res,
                                        const float* output_scale,
                                        cudaStream_t stream)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     mu_->lock();
 
     const void* devAscalePtr = (const void*)kernel_scale;
@@ -1018,4 +1018,4 @@ template void cublasFP8MMWrapper::Gemm_Bias_Act<false, false>(__nv_fp8_e4m3*
                                                               const float* output_scale,
                                                               cudaStream_t stream);
 
-}  // namespace fastertransformer
+}  // namespace turbomind
@@ -16,9 +16,9 @@
 #include "3rdparty/fp8_qgmma_1x1/fp8_qgmma_1x1_utils.h"
 #include "cuda_utils.h"
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
 #include <cublasLt.h>
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
@@ -28,7 +28,7 @@
 #pragma once
 
-namespace fastertransformer {
+namespace turbomind {
 
 class cublasFP8MMWrapper: public cublasMMWrapper {
 public:
@@ -170,8 +170,8 @@ public:
 private:
     int version_major_, version_minor_, version_patch_;
 
-    fastertransformer::qgmma1x1Launcher qgmmaLauncher;
+    turbomind::qgmma1x1Launcher qgmmaLauncher;
     void* cublas_workspace_qgemm_ = nullptr;
 };
 
-}  // namespace fastertransformer
+}  // namespace turbomind
@@ -20,7 +20,7 @@
 #error CUDART_VERSION Undefined!
 #endif
 
-namespace fastertransformer {
+namespace turbomind {
 
 cublasINT8MMWrapper::cublasINT8MMWrapper(cublasLtHandle_t cublaslt_handle,
                                          cudaStream_t     stream,
                                          cublasAlgoMap*   cublas_algo_map,
@@ -556,4 +556,4 @@ void cublasINT8MMWrapper::SpGemm(
     mu_->unlock();
 }
 #endif
 
-}  // namespace fastertransformer
+}  // namespace turbomind
@@ -15,9 +15,9 @@
  */
 
 #include "cuda_utils.h"
-#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
+#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
 #include <cublasLt.h>
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
@@ -26,7 +26,7 @@
 #include <string>
 
 #pragma once
 
-namespace fastertransformer {
+namespace turbomind {
 
 class cublasINT8MMWrapper: public cublasMMWrapper {
 private:
@@ -91,4 +91,4 @@ public:
 #endif
 };
 
-}  // namespace fastertransformer
+}  // namespace turbomind
@@ -21,7 +21,7 @@
 #error CUDART_VERSION Undefined!
 #endif
 
-namespace fastertransformer {
+namespace turbomind {
 
 cublasMMWrapper::cublasMMWrapper(cublasHandle_t   cublas_handle,
                                  cublasLtHandle_t cublaslt_handle,
                                  cudaStream_t     stream,
@@ -35,7 +35,7 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
     mu_(mu),
     allocator_(allocator)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (allocator_ != nullptr) {
         cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
     }
@@ -57,7 +57,7 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
     mu_(mu),
     allocator_(allocator)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (allocator_ != nullptr) {
         cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
     }
@@ -66,7 +66,7 @@ cublasMMWrapper::cublasMMWrapper(cublasHandle_t cublas_handle,
 cublasMMWrapper::~cublasMMWrapper()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     mu_ = nullptr;
     if (allocator_ != nullptr) {
         allocator_->free((void**)(&cublas_workspace_));
@@ -85,7 +85,7 @@ cublasMMWrapper::cublasMMWrapper(const cublasMMWrapper& wrapper):
     mu_(wrapper.mu_),
     allocator_(wrapper.allocator_)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (allocator_ != nullptr) {
         cublas_workspace_ = allocator_->reMalloc(cublas_workspace_, CUBLAS_WORKSPACE_SIZE, false);
     }
@@ -110,7 +110,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
                            cudaDataType_t   computeType,
                            cublasGemmAlgo_t algo)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     mu_->lock();
     check_cuda_error(cublasGemmEx(cublas_handle_,
                                   transa,
@@ -147,7 +147,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
                           void*     C,
                           const int ldc)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     Gemm(transa, transb, m, n, k, A, lda, B, ldb, C, ldc, 1.0f, 0.0f);
 }
@@ -165,7 +165,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
                            float f_alpha,
                            float f_beta)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     half h_alpha = (half)(f_alpha);
     half h_beta  = (half)(f_beta);
@@ -396,7 +396,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
                            void*     C,
                            const int ldc)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     cudaDataType_t      Atype, Btype, Ctype;
     cublasComputeType_t computeType;
     cudaDataType_t      scaleType;
@@ -1099,4 +1099,4 @@ void cublasMMWrapper::Int8Gemm(const int m,
     return _Int8Gemm(m, n, k, A, lda, B, ldb, C, ldc, (float*)nullptr, 1, false);
 }
 
-}  // namespace fastertransformer
+}  // namespace turbomind
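The short `Gemm` overload above forwards to the scalar version with alpha = 1.0f and beta = 0.0f, i.e. plain C = op(A) * op(B). A hedged call sketch; the wrapper construction and the device buffers are assumed from elsewhere:

```cpp
// Assumes the turbomind cublasMMWrapper header is included and that
// d_A/d_B/d_C are valid device buffers of matching element type.
void gemmDemo(turbomind::cublasMMWrapper& wrapper,
              int m, int n, int k,
              const void* d_A, const void* d_B, void* d_C)
{
    // C (m x n) = A (m x k) * B (k x n) in cuBLAS column-major convention;
    // forwards to Gemm(..., 1.0f, 0.0f).
    wrapper.Gemm(CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                 d_A, /*lda=*/m, d_B, /*ldb=*/k, d_C, /*ldc=*/m);
}
```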
@@ -15,8 +15,8 @@
  */
 
 #include "cuda_utils.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasAlgoMap.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cublasAlgoMap.h"
 #include <cublasLt.h>
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
@@ -25,7 +25,7 @@
 #include <string>
 
 #pragma once
 
-namespace fastertransformer {
+namespace turbomind {
 
 class cublasMMWrapper {
 protected:
@@ -293,4 +293,4 @@ public:
 #endif
 };
 
-}  // namespace fastertransformer
+}  // namespace turbomind
@@ -16,10 +16,10 @@
 #pragma once
 
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
 #include <cuda_fp16.h>
 
-namespace fastertransformer {
+namespace turbomind {
 
 #ifdef ENABLE_BF16
 inline __device__ float2 bf1622float2(const __nv_bfloat162 val)
@@ -287,4 +287,4 @@ inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c)
 #endif  // ENABLE_BF16
 
-}  // namespace fastertransformer
+}  // namespace turbomind
@@ -16,7 +16,7 @@
 #include "cuda_fp8_utils.h"
 
-namespace fastertransformer {
+namespace turbomind {
 
 #ifdef ENABLE_FP8
 template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
@@ -121,4 +121,4 @@ template void
 invokeComputeFP8QuantizeScale(float* quant_ptr, const float* weights, const int k, const int n, cudaStream_t stream);
 #endif  // ENABLE_FP8
 
-}  // namespace fastertransformer
+}  // namespace turbomind
@@ -31,7 +31,7 @@
 #define USE_QGMMA
 #endif
 
-namespace fastertransformer {
+namespace turbomind {
 
 const float FP8_E4M3_MAX = 480.0f;
@@ -190,5 +190,5 @@ void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int size, cudaStream_t stream);
 template<typename T_W>
 void invokeComputeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n, cudaStream_t stream);
 
-}  // namespace fastertransformer
+}  // namespace turbomind
 #endif  // ENABLE_FP8
@@ -16,13 +16,13 @@
 #pragma once
 
-#include "src/fastertransformer/utils/cuda_bf16_fallbacks.cuh"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cuda_bf16_fallbacks.cuh"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
 #include <cuda.h>
 #include <cuda_fp16.h>
 
-namespace fastertransformer {
+namespace turbomind {
 
 template<typename T>
 inline __device__ T ldg(const T* val)
@@ -598,4 +598,4 @@ __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val)
 #endif  // ENABLE_FP8
 
-}  // namespace fastertransformer
+}  // namespace turbomind