Unverified commit 35d64462, authored by lvhan028, committed by GitHub
Browse files

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include <curand_kernel.h> #include <curand_kernel.h>
namespace fastertransformer { namespace turbomind {
void invokeTopPInitialize(int* topp_id_val_buf, void invokeTopPInitialize(int* topp_id_val_buf,
int* topp_offset_buf, int* topp_offset_buf,
...@@ -152,4 +152,4 @@ void invokeComputeToppDecay(float* runtime_top_p, ...@@ -152,4 +152,4 @@ void invokeComputeToppDecay(float* runtime_top_p,
const int local_batch_size, const int local_batch_size,
cudaStream_t stream); cudaStream_t stream);
} // namespace fastertransformer } // namespace turbomind
...@@ -14,12 +14,12 @@ ...@@ -14,12 +14,12 @@
* limitations under the License. * limitations under the License.
*/ */
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh" #include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/kernels/stop_criteria_kernels.h" #include "src/turbomind/kernels/stop_criteria_kernels.h"
#include "src/fastertransformer/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
namespace fastertransformer { namespace turbomind {
__global__ void stop_words_criterion(const int* output_ids, __global__ void stop_words_criterion(const int* output_ids,
const int* parent_ids, const int* parent_ids,
...@@ -91,7 +91,7 @@ void invokeStopWordsCriterion(const int* output_ids, ...@@ -91,7 +91,7 @@ void invokeStopWordsCriterion(const int* output_ids,
int step, int step,
cudaStream_t stream) cudaStream_t stream)
{ {
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
// Check if we have sampled a word from the stop_words list. If so, stop the sequence. // Check if we have sampled a word from the stop_words list. If so, stop the sequence.
dim3 block, grid; dim3 block, grid;
block.x = min(((stop_words_len + 32 - 1) / 32) * 32, 256UL); block.x = min(((stop_words_len + 32 - 1) / 32) * 32, 256UL);
...@@ -143,7 +143,7 @@ void invokeLengthCriterion(bool* finished, ...@@ -143,7 +143,7 @@ void invokeLengthCriterion(bool* finished,
{ {
// Check if we have attained the sequence length limit. If so, stop the sequence. // Check if we have attained the sequence length limit. If so, stop the sequence.
// In addition, check if all sequences are stopped and return the result in should_stop // In addition, check if all sequences are stopped and return the result in should_stop
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
dim3 block{min(512, uint32_t(batch_size * beam_width))}; dim3 block{min(512, uint32_t(batch_size * beam_width))};
dim3 grid{1}; dim3 grid{1};
h_pinned_finished_sum_[0] = -1; h_pinned_finished_sum_[0] = -1;
...@@ -156,4 +156,4 @@ void invokeLengthCriterion(bool* finished, ...@@ -156,4 +156,4 @@ void invokeLengthCriterion(bool* finished,
*should_stop = h_pinned_finished_sum_[0] == batch_size * beam_width; *should_stop = h_pinned_finished_sum_[0] == batch_size * beam_width;
} }
} // namespace fastertransformer } // namespace turbomind
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include <cuda_runtime.h> #include <cuda_runtime.h>
namespace fastertransformer { namespace turbomind {
void invokeStopWordsCriterion(const int* output_ids, void invokeStopWordsCriterion(const int* output_ids,
const int* parent_ids, const int* parent_ids,
...@@ -39,4 +39,4 @@ void invokeLengthCriterion(bool* finished, ...@@ -39,4 +39,4 @@ void invokeLengthCriterion(bool* finished,
int step, int step,
cudaStream_t stream); cudaStream_t stream);
} // namespace fastertransformer } // namespace turbomind
...@@ -15,13 +15,13 @@ ...@@ -15,13 +15,13 @@
* limitations under the License. * limitations under the License.
*/ */
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h" #include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh" #include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/kernels/unfused_attention_kernels.h" #include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh" #include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/fastertransformer/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
namespace fastertransformer { namespace turbomind {
__inline__ __device__ int target_index(int id1, int id2, int id3, int id4, int dim_1, int dim_2, int dim_3, int dim_4) __inline__ __device__ int target_index(int id1, int id2, int id3, int id4, int dim_1, int dim_2, int dim_3, int dim_4)
{ {
...@@ -2520,4 +2520,4 @@ INSTANTIATETRANSPOSEATTENTIONS(__nv_bfloat16); ...@@ -2520,4 +2520,4 @@ INSTANTIATETRANSPOSEATTENTIONS(__nv_bfloat16);
#endif #endif
#undef INSTANTIATETRANSPOSEATTENTIONS #undef INSTANTIATETRANSPOSEATTENTIONS
} // namespace fastertransformer } // namespace turbomind
...@@ -15,9 +15,9 @@ ...@@ -15,9 +15,9 @@
*/ */
#pragma once #pragma once
#include "src/fastertransformer/utils/Tensor.h" #include "src/turbomind/utils/Tensor.h"
namespace fastertransformer { namespace turbomind {
template<typename T> template<typename T>
void invokeAddQKVBiasIA3Transpose(T* q_buf, void invokeAddQKVBiasIA3Transpose(T* q_buf,
...@@ -262,4 +262,4 @@ void invokeMaskedSoftMaxWithRelPosBias(T* qk_buf, ...@@ -262,4 +262,4 @@ void invokeMaskedSoftMaxWithRelPosBias(T* qk_buf,
template<typename T> template<typename T>
void invokeTransposeAttentions(Tensor& attentions_out, const Tensor& attentions_in, cudaStream_t stream = 0); void invokeTransposeAttentions(Tensor& attentions_out, const Tensor& attentions_in, cudaStream_t stream = 0);
} // namespace fastertransformer } // namespace turbomind
...@@ -18,11 +18,11 @@ ...@@ -18,11 +18,11 @@
#include <assert.h> #include <assert.h>
#include "src/fastertransformer/utils/Tensor.h" #include "src/turbomind/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h" #include "src/turbomind/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h" #include "src/turbomind/utils/cublasMMWrapper.h"
namespace fastertransformer { namespace turbomind {
class BaseLayer { class BaseLayer {
public: public:
...@@ -65,4 +65,4 @@ protected: ...@@ -65,4 +65,4 @@ protected:
bool sparse_; bool sparse_;
}; };
} // namespace fastertransformer } // namespace turbomind
...@@ -15,10 +15,10 @@ ...@@ -15,10 +15,10 @@
*/ */
#pragma once #pragma once
#include "src/fastertransformer/utils/cuda_fp8_utils.h" #include "src/turbomind/utils/cuda_fp8_utils.h"
#include "stdlib.h" #include "stdlib.h"
namespace fastertransformer { namespace turbomind {
// Note that the int8 mode of BERT and GPT are different. // Note that the int8 mode of BERT and GPT are different.
// For int8 mode = 2 on GPT: // For int8 mode = 2 on GPT:
...@@ -63,4 +63,4 @@ struct DenseWeight { ...@@ -63,4 +63,4 @@ struct DenseWeight {
bool fuse_gemm_bias = false; bool fuse_gemm_bias = false;
}; };
} // namespace fastertransformer } // namespace turbomind
...@@ -19,9 +19,9 @@ ...@@ -19,9 +19,9 @@
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include "src/fastertransformer/layers/BaseLayer.h" #include "src/turbomind/layers/BaseLayer.h"
namespace fastertransformer { namespace turbomind {
class DynamicDecodeBaseLayer: public BaseLayer { class DynamicDecodeBaseLayer: public BaseLayer {
protected: protected:
...@@ -39,11 +39,11 @@ public: ...@@ -39,11 +39,11 @@ public:
DynamicDecodeBaseLayer(DynamicDecodeBaseLayer const& dynamic_decode_layer): BaseLayer(dynamic_decode_layer){}; DynamicDecodeBaseLayer(DynamicDecodeBaseLayer const& dynamic_decode_layer): BaseLayer(dynamic_decode_layer){};
virtual void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) = 0; virtual void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) = 0;
virtual void forward(std::vector<fastertransformer::Tensor>* output_tensors, virtual void forward(std::vector<turbomind::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors) = 0; const std::vector<turbomind::Tensor>* input_tensors) = 0;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors, virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors) = 0; const std::unordered_map<std::string, Tensor>* input_tensors) = 0;
virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors) = 0; virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors) = 0;
}; };
} // namespace fastertransformer } // namespace turbomind
...@@ -14,21 +14,21 @@ ...@@ -14,21 +14,21 @@
* limitations under the License. * limitations under the License.
*/ */
#include "src/fastertransformer/layers/DynamicDecodeLayer.h" #include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/fastertransformer/kernels/ban_bad_words.h" #include "src/turbomind/kernels/ban_bad_words.h"
#include "src/fastertransformer/kernels/stop_criteria_kernels.h" #include "src/turbomind/kernels/stop_criteria_kernels.h"
#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h" #include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include "src/fastertransformer/layers/beam_search_layers/BeamSearchLayer.h" #include "src/turbomind/layers/beam_search_layers/BeamSearchLayer.h"
#include "src/fastertransformer/layers/beam_search_layers/OnlineBeamSearchLayer.h" #include "src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
namespace fastertransformer { namespace turbomind {
template<typename T> template<typename T>
void DynamicDecodeLayer<T>::allocateBuffer() void DynamicDecodeLayer<T>::allocateBuffer()
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
h_pinned_finished_sum_ = (int*)allocator_->reMalloc(h_pinned_finished_sum_, sizeof(int), true, true); h_pinned_finished_sum_ = (int*)allocator_->reMalloc(h_pinned_finished_sum_, sizeof(int), true, true);
return; return;
} }
...@@ -36,7 +36,7 @@ void DynamicDecodeLayer<T>::allocateBuffer() ...@@ -36,7 +36,7 @@ void DynamicDecodeLayer<T>::allocateBuffer()
template<typename T> template<typename T>
void DynamicDecodeLayer<T>::freeBuffer() void DynamicDecodeLayer<T>::freeBuffer()
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
allocator_->free((void**)(&h_pinned_finished_sum_), true); allocator_->free((void**)(&h_pinned_finished_sum_), true);
return; return;
} }
...@@ -44,7 +44,7 @@ void DynamicDecodeLayer<T>::freeBuffer() ...@@ -44,7 +44,7 @@ void DynamicDecodeLayer<T>::freeBuffer()
template<typename T> template<typename T>
void DynamicDecodeLayer<T>::initialize() void DynamicDecodeLayer<T>::initialize()
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
online_beamsearch_decode_ = new OnlineBeamSearchLayer<T>(0, // max_batch_size, deprecated online_beamsearch_decode_ = new OnlineBeamSearchLayer<T>(0, // max_batch_size, deprecated
0, // local_head_num, deprecated 0, // local_head_num, deprecated
0, // size_per_head, deprecated 0, // size_per_head, deprecated
...@@ -123,14 +123,14 @@ DynamicDecodeLayer<T>::DynamicDecodeLayer(size_t vocab_size, ...@@ -123,14 +123,14 @@ DynamicDecodeLayer<T>::DynamicDecodeLayer(size_t vocab_size,
vocab_size_padded_(vocab_size_padded), vocab_size_padded_(vocab_size_padded),
cuda_device_prop_(cuda_device_prop) cuda_device_prop_(cuda_device_prop)
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize(); initialize();
} }
template<typename T> template<typename T>
DynamicDecodeLayer<T>::~DynamicDecodeLayer() DynamicDecodeLayer<T>::~DynamicDecodeLayer()
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
delete online_beamsearch_decode_; delete online_beamsearch_decode_;
delete beamsearch_decode_; delete beamsearch_decode_;
delete topk_decode_; delete topk_decode_;
...@@ -145,7 +145,7 @@ DynamicDecodeLayer<T>::DynamicDecodeLayer(DynamicDecodeLayer const& dynamic_deco ...@@ -145,7 +145,7 @@ DynamicDecodeLayer<T>::DynamicDecodeLayer(DynamicDecodeLayer const& dynamic_deco
vocab_size_padded_(dynamic_decode_layer.vocab_size_padded_), vocab_size_padded_(dynamic_decode_layer.vocab_size_padded_),
cuda_device_prop_(dynamic_decode_layer.cuda_device_prop_) cuda_device_prop_(dynamic_decode_layer.cuda_device_prop_)
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize(); initialize();
} }
...@@ -169,7 +169,7 @@ void DynamicDecodeLayer<T>::setup(const size_t batch_size, const size_t beam_wid ...@@ -169,7 +169,7 @@ void DynamicDecodeLayer<T>::setup(const size_t batch_size, const size_t beam_wid
* \param top_p_reset_ids [batch_size] on gpu, uint32, optional * \param top_p_reset_ids [batch_size] on gpu, uint32, optional
*/ */
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
has_diff_runtime_args_ = hasDiffRuntimeArgs(runtime_args); has_diff_runtime_args_ = hasDiffRuntimeArgs(runtime_args);
if (beam_width == 1) { // sampling layers if (beam_width == 1) { // sampling layers
topk_decode_->setup(batch_size, beam_width, runtime_args); topk_decode_->setup(batch_size, beam_width, runtime_args);
...@@ -181,7 +181,7 @@ template<typename T> ...@@ -181,7 +181,7 @@ template<typename T>
void DynamicDecodeLayer<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors, void DynamicDecodeLayer<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors) const std::unordered_map<std::string, Tensor>* input_tensors)
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
TensorMap input_map(*input_tensors); TensorMap input_map(*input_tensors);
TensorMap output_map(*output_tensors); TensorMap output_map(*output_tensors);
forward(&output_map, &input_map); forward(&output_map, &input_map);
...@@ -235,7 +235,7 @@ void DynamicDecodeLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_ ...@@ -235,7 +235,7 @@ void DynamicDecodeLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_
* *
*/ */
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
const int ite = (int)input_tensors->at("ite").getVal<uint>(); const int ite = (int)input_tensors->at("ite").getVal<uint>();
const int step = input_tensors->at("step").getVal<int>(); const int step = input_tensors->at("step").getVal<int>();
FT_CHECK(input_tensors->at("logits").shape.size() == 3); FT_CHECK(input_tensors->at("logits").shape.size() == 3);
...@@ -516,4 +516,4 @@ bool DynamicDecodeLayer<T>::hasDiffRuntimeArgs(TensorMap* input_tensors) ...@@ -516,4 +516,4 @@ bool DynamicDecodeLayer<T>::hasDiffRuntimeArgs(TensorMap* input_tensors)
template class DynamicDecodeLayer<float>; template class DynamicDecodeLayer<float>;
template class DynamicDecodeLayer<half>; template class DynamicDecodeLayer<half>;
} // namespace fastertransformer } // namespace turbomind
...@@ -19,12 +19,12 @@ ...@@ -19,12 +19,12 @@
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include "src/fastertransformer/kernels/beam_search_topk_kernels.h" #include "src/turbomind/kernels/beam_search_topk_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h" #include "src/turbomind/layers/BaseLayer.h"
#include "src/fastertransformer/layers/DynamicDecodeBaseLayer.h" #include "src/turbomind/layers/DynamicDecodeBaseLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
namespace fastertransformer { namespace turbomind {
template<typename T> template<typename T>
class DynamicDecodeLayer: public BaseLayer { class DynamicDecodeLayer: public BaseLayer {
...@@ -83,4 +83,4 @@ public: ...@@ -83,4 +83,4 @@ public:
const std::unordered_map<std::string, Tensor>* input_tensors); const std::unordered_map<std::string, Tensor>* input_tensors);
}; };
} // namespace fastertransformer } // namespace turbomind
...@@ -14,12 +14,12 @@ ...@@ -14,12 +14,12 @@
* limitations under the License. * limitations under the License.
*/ */
#include "src/fastertransformer/layers/FfnFP8Layer.h" #include "src/turbomind/layers/FfnFP8Layer.h"
#include "src/fastertransformer/kernels/activation_fp8_kernels.h" #include "src/turbomind/kernels/activation_fp8_kernels.h"
#include "src/fastertransformer/utils/cublasFP8MMWrapper.h" #include "src/turbomind/utils/cublasFP8MMWrapper.h"
#include "src/fastertransformer/utils/nvtx_utils.h" #include "src/turbomind/utils/nvtx_utils.h"
namespace fastertransformer { namespace turbomind {
template<typename T1, typename T2> template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::forward(TensorMap* output_tensors, void FfnFP8Layer<T1, T2>::forward(TensorMap* output_tensors,
...@@ -32,7 +32,7 @@ void FfnFP8Layer<T1, T2>::forward(TensorMap* output_tensors, ...@@ -32,7 +32,7 @@ void FfnFP8Layer<T1, T2>::forward(TensorMap* output_tensors,
// output tensors: // output tensors:
// output_hidden_state [token_num, d_model], // output_hidden_state [token_num, d_model],
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() == 1); FT_CHECK(input_tensors->size() == 1);
FT_CHECK(output_tensors->size() == 1); FT_CHECK(output_tensors->size() == 1);
...@@ -396,7 +396,7 @@ FfnFP8Layer<T1, T2>::FfnFP8Layer(size_t inter_size, ...@@ -396,7 +396,7 @@ FfnFP8Layer<T1, T2>::FfnFP8Layer(size_t inter_size,
inter_size_(inter_size), inter_size_(inter_size),
fp8_mode_(fp8_mode) fp8_mode_(fp8_mode)
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
} }
template<typename T1, typename T2> template<typename T1, typename T2>
...@@ -410,13 +410,13 @@ FfnFP8Layer<T1, T2>::FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer): ...@@ -410,13 +410,13 @@ FfnFP8Layer<T1, T2>::FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer):
inter_size_(ffn_layer.inter_size_), inter_size_(ffn_layer.inter_size_),
fp8_mode_(ffn_layer.fp8_mode_) fp8_mode_(ffn_layer.fp8_mode_)
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
} }
template<typename T1, typename T2> template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::~FfnFP8Layer() FfnFP8Layer<T1, T2>::~FfnFP8Layer()
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
cublas_wrapper_ = nullptr; cublas_wrapper_ = nullptr;
freeBuffer(); freeBuffer();
} }
...@@ -430,7 +430,7 @@ void FfnFP8Layer<T1, T2>::allocateBuffer() ...@@ -430,7 +430,7 @@ void FfnFP8Layer<T1, T2>::allocateBuffer()
template<typename T1, typename T2> template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::allocateBuffer(size_t token_num) void FfnFP8Layer<T1, T2>::allocateBuffer(size_t token_num)
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
inter_buf_ = (T1*)allocator_->reMalloc(inter_buf_, sizeof(T1) * token_num * inter_size_, false); inter_buf_ = (T1*)allocator_->reMalloc(inter_buf_, sizeof(T1) * token_num * inter_size_, false);
inter_buf_bf16_ = (T2*)allocator_->reMalloc(inter_buf_bf16_, sizeof(T2) * token_num * inter_size_, false); inter_buf_bf16_ = (T2*)allocator_->reMalloc(inter_buf_bf16_, sizeof(T2) * token_num * inter_size_, false);
...@@ -440,7 +440,7 @@ void FfnFP8Layer<T1, T2>::allocateBuffer(size_t token_num) ...@@ -440,7 +440,7 @@ void FfnFP8Layer<T1, T2>::allocateBuffer(size_t token_num)
template<typename T1, typename T2> template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::freeBuffer() void FfnFP8Layer<T1, T2>::freeBuffer()
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) { if (is_allocate_buffer_) {
allocator_->free((void**)(&inter_buf_)); allocator_->free((void**)(&inter_buf_));
allocator_->free((void**)(&inter_buf_bf16_)); allocator_->free((void**)(&inter_buf_bf16_));
...@@ -532,4 +532,4 @@ void ReluFfnFP8Layer<T1, T2>::invokeAddBiasActivation(const int m, ...@@ -532,4 +532,4 @@ void ReluFfnFP8Layer<T1, T2>::invokeAddBiasActivation(const int m,
template class ReluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>; template class ReluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;
} // namespace fastertransformer } // namespace turbomind
...@@ -16,13 +16,13 @@ ...@@ -16,13 +16,13 @@
#pragma once #pragma once
#include "src/fastertransformer/layers/BaseLayer.h" #include "src/turbomind/layers/BaseLayer.h"
#include "src/fastertransformer/layers/FfnFP8Weight.h" #include "src/turbomind/layers/FfnFP8Weight.h"
#include "src/fastertransformer/layers/FfnLayer.h" #include "src/turbomind/layers/FfnLayer.h"
#include "src/fastertransformer/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
#include <vector> #include <vector>
namespace fastertransformer { namespace turbomind {
template<typename T1, typename T2> template<typename T1, typename T2>
class FfnFP8Layer: public BaseLayer { class FfnFP8Layer: public BaseLayer {
...@@ -130,4 +130,4 @@ private: ...@@ -130,4 +130,4 @@ private:
const float* output_scale) override; const float* output_scale) override;
}; };
} // namespace fastertransformer } // namespace turbomind
...@@ -17,8 +17,8 @@ ...@@ -17,8 +17,8 @@
#pragma once #pragma once
#include "FfnWeight.h" #include "FfnWeight.h"
#include "src/fastertransformer/utils/ScaleList.h" #include "src/turbomind/utils/ScaleList.h"
namespace fastertransformer { namespace turbomind {
template<typename T1, typename T2> template<typename T1, typename T2>
struct FfnFP8Weight: FfnWeight<T1, T2> { struct FfnFP8Weight: FfnWeight<T1, T2> {
...@@ -27,4 +27,4 @@ struct FfnFP8Weight: FfnWeight<T1, T2> { ...@@ -27,4 +27,4 @@ struct FfnFP8Weight: FfnWeight<T1, T2> {
float* identity_h_scale; float* identity_h_scale;
}; };
} // namespace fastertransformer } // namespace turbomind
...@@ -17,12 +17,12 @@ ...@@ -17,12 +17,12 @@
#pragma once #pragma once
#include "FfnWeight.h" #include "FfnWeight.h"
#include "src/fastertransformer/utils/ScaleList.h" #include "src/turbomind/utils/ScaleList.h"
namespace fastertransformer { namespace turbomind {
template<typename T> template<typename T>
struct FfnINT8Weight: FfnWeight<T> { struct FfnINT8Weight: FfnWeight<T> {
ScaleList* scale_list_ptr; ScaleList* scale_list_ptr;
}; };
} // namespace fastertransformer } // namespace turbomind
...@@ -14,15 +14,15 @@ ...@@ -14,15 +14,15 @@
* limitations under the License. * limitations under the License.
*/ */
#include "src/fastertransformer/layers/FfnLayer.h" #include "src/turbomind/layers/FfnLayer.h"
#include "src/fastertransformer/kernels/transpose_int8_kernels.h" #include "src/turbomind/kernels/transpose_int8_kernels.h"
#include "src/fastertransformer/utils/nvtx_utils.h" #include "src/turbomind/utils/nvtx_utils.h"
namespace fastertransformer { namespace turbomind {
template<typename T> template<typename T>
void FfnLayer<T>::forward(std::vector<fastertransformer::Tensor>* output_tensors, void FfnLayer<T>::forward(std::vector<turbomind::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors, const std::vector<turbomind::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights) const FfnWeight<T>* ffn_weights)
{ {
TensorMap input_tensor({{"ffn_input", input_tensors->at(0)}}); TensorMap input_tensor({{"ffn_input", input_tensors->at(0)}});
...@@ -46,7 +46,7 @@ void FfnLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors, c ...@@ -46,7 +46,7 @@ void FfnLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors, c
// expanded_source_row_to_expanded_dest_row [token_num, moe_k] (optional) // expanded_source_row_to_expanded_dest_row [token_num, moe_k] (optional)
// expert_for_source_row [token_num, moe_k] (optional) // expert_for_source_row [token_num, moe_k] (optional)
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() >= 1 && input_tensors->size() <= 5); FT_CHECK(input_tensors->size() >= 1 && input_tensors->size() <= 5);
FT_CHECK(output_tensors->size() >= 1 || output_tensors->size() <= 4); FT_CHECK(output_tensors->size() >= 1 || output_tensors->size() <= 4);
bool use_moe = false; bool use_moe = false;
...@@ -405,7 +405,7 @@ FfnLayer<T>::FfnLayer(size_t max_batch_size, ...@@ -405,7 +405,7 @@ FfnLayer<T>::FfnLayer(size_t max_batch_size,
use_gated_activation_(use_gated_activation), use_gated_activation_(use_gated_activation),
int8_fc_runner_(int8_mode == 2 ? std::make_shared<CutlassInt8GemmRunner<T>>() : nullptr) int8_fc_runner_(int8_mode == 2 ? std::make_shared<CutlassInt8GemmRunner<T>>() : nullptr)
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (int8_mode_ == 0) { if (int8_mode_ == 0) {
moe_fc_runner_ = std::make_shared<CutlassMoeFCRunner<T, T>>(); moe_fc_runner_ = std::make_shared<CutlassMoeFCRunner<T, T>>();
} }
...@@ -438,13 +438,13 @@ FfnLayer<T>::FfnLayer(FfnLayer<T> const& ffn_layer): ...@@ -438,13 +438,13 @@ FfnLayer<T>::FfnLayer(FfnLayer<T> const& ffn_layer):
weight_only_int8_fc_runner_(ffn_layer.weight_only_int8_fc_runner_), weight_only_int8_fc_runner_(ffn_layer.weight_only_int8_fc_runner_),
int8_fc_runner_(ffn_layer.int8_fc_runner_) int8_fc_runner_(ffn_layer.int8_fc_runner_)
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
} }
template<typename T> template<typename T>
FfnLayer<T>::~FfnLayer() FfnLayer<T>::~FfnLayer()
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
cublas_wrapper_ = nullptr; cublas_wrapper_ = nullptr;
freeBuffer(); freeBuffer();
} }
...@@ -459,7 +459,7 @@ void FfnLayer<T>::allocateBuffer() ...@@ -459,7 +459,7 @@ void FfnLayer<T>::allocateBuffer()
template<typename T> template<typename T>
void FfnLayer<T>::allocateBuffer(size_t token_num, int moe_k, bool use_moe) void FfnLayer<T>::allocateBuffer(size_t token_num, int moe_k, bool use_moe)
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (use_moe) { if (use_moe) {
moe_gates_buf_ = moe_gates_buf_ =
(T*)allocator_->reMalloc(moe_gates_buf_, sizeof(T) * pad_to_multiple_of_16(token_num * expert_num_), false); (T*)allocator_->reMalloc(moe_gates_buf_, sizeof(T) * pad_to_multiple_of_16(token_num * expert_num_), false);
...@@ -505,7 +505,7 @@ void FfnLayer<T>::allocateBuffer(size_t token_num, int moe_k, bool use_moe) ...@@ -505,7 +505,7 @@ void FfnLayer<T>::allocateBuffer(size_t token_num, int moe_k, bool use_moe)
template<typename T> template<typename T>
void FfnLayer<T>::freeBuffer() void FfnLayer<T>::freeBuffer()
{ {
FT_LOG_DEBUG(__PRETTY_FUNCTION__); TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) { if (is_allocate_buffer_) {
allocator_->free((void**)(&inter_buf_)); allocator_->free((void**)(&inter_buf_));
if (use_gated_activation_) { if (use_gated_activation_) {
...@@ -712,4 +712,4 @@ template class SiluFfnLayer<half>; ...@@ -712,4 +712,4 @@ template class SiluFfnLayer<half>;
template class SiluFfnLayer<__nv_bfloat16>; template class SiluFfnLayer<__nv_bfloat16>;
#endif #endif
} // namespace fastertransformer } // namespace turbomind
...@@ -16,20 +16,20 @@ ...@@ -16,20 +16,20 @@
#pragma once #pragma once
#include "src/fastertransformer/kernels/activation_kernels.h" #include "src/turbomind/kernels/activation_kernels.h"
#include "src/fastertransformer/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" #include "src/turbomind/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
#include "src/fastertransformer/kernels/cutlass_kernels/int8_gemm/int8_gemm.h" #include "src/turbomind/kernels/cutlass_kernels/int8_gemm/int8_gemm.h"
#include "src/fastertransformer/kernels/matrix_vector_multiplication.h" #include "src/turbomind/kernels/matrix_vector_multiplication.h"
#include "src/fastertransformer/kernels/moe_kernels.h" #include "src/turbomind/kernels/moe_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h" #include "src/turbomind/layers/BaseLayer.h"
#include "src/fastertransformer/layers/FfnWeight.h" #include "src/turbomind/layers/FfnWeight.h"
#include "src/fastertransformer/utils/activation_types.h" #include "src/turbomind/utils/activation_types.h"
#include "src/fastertransformer/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
#include <stdint.h> #include <stdint.h>
#include <vector> #include <vector>
namespace fastertransformer { namespace turbomind {
template<typename T> template<typename T>
class FfnLayer: public BaseLayer { class FfnLayer: public BaseLayer {
...@@ -122,8 +122,8 @@ public: ...@@ -122,8 +122,8 @@ public:
inter_size_ = runtime_inter_size; inter_size_ = runtime_inter_size;
} }
virtual void forward(std::vector<fastertransformer::Tensor>* output_tensors, virtual void forward(std::vector<turbomind::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors, const std::vector<turbomind::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights); const FfnWeight<T>* ffn_weights);
virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnWeight<T>* ffn_weights); virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnWeight<T>* ffn_weights);
}; };
...@@ -229,4 +229,4 @@ private: ...@@ -229,4 +229,4 @@ private:
using FfnLayer<T>::inter_size_; using FfnLayer<T>::inter_size_;
}; };
} // namespace fastertransformer } // namespace turbomind
...@@ -15,13 +15,13 @@ ...@@ -15,13 +15,13 @@
*/ */
#include "FfnLayerINT8.h" #include "FfnLayerINT8.h"
#include "src/fastertransformer/utils/nvtx_utils.h" #include "src/turbomind/utils/nvtx_utils.h"
namespace fastertransformer { namespace turbomind {
template<typename T> template<typename T>
void FfnLayerINT8<T>::forward(std::vector<fastertransformer::Tensor>* output_tensors, void FfnLayerINT8<T>::forward(std::vector<turbomind::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors, const std::vector<turbomind::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights) const FfnWeight<T>* ffn_weights)
{ {
// input_tensors: [input (token_num, hidden_dimension)] // input_tensors: [input (token_num, hidden_dimension)]
...@@ -337,4 +337,4 @@ void ReluFfnLayerINT8<T>::invokeAddBiasActivation(const int m, const T* bias, Sc ...@@ -337,4 +337,4 @@ void ReluFfnLayerINT8<T>::invokeAddBiasActivation(const int m, const T* bias, Sc
template class ReluFfnLayerINT8<float>; template class ReluFfnLayerINT8<float>;
template class ReluFfnLayerINT8<half>; template class ReluFfnLayerINT8<half>;
} // namespace fastertransformer } // namespace turbomind
...@@ -17,16 +17,16 @@ ...@@ -17,16 +17,16 @@
#pragma once #pragma once
#include "FfnINT8Weight.h" #include "FfnINT8Weight.h"
#include "src/fastertransformer/kernels/activation_int8_kernels.h" #include "src/turbomind/kernels/activation_int8_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h" #include "src/turbomind/layers/BaseLayer.h"
#include "src/fastertransformer/utils/ScaleList.h" #include "src/turbomind/utils/ScaleList.h"
#include "src/fastertransformer/utils/Tensor.h" #include "src/turbomind/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h" #include "src/turbomind/utils/allocator.h"
#include "src/fastertransformer/utils/cublasINT8MMWrapper.h" #include "src/turbomind/utils/cublasINT8MMWrapper.h"
#include "src/fastertransformer/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
#include <vector> #include <vector>
namespace fastertransformer { namespace turbomind {
template<typename T> template<typename T>
class GeluFfnLayerINT8; class GeluFfnLayerINT8;
...@@ -77,8 +77,8 @@ public: ...@@ -77,8 +77,8 @@ public:
~FfnLayerINT8(); ~FfnLayerINT8();
void forward(std::vector<fastertransformer::Tensor>* output_tensors, void forward(std::vector<turbomind::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors, const std::vector<turbomind::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights); const FfnWeight<T>* ffn_weights);
friend GeluFfnLayerINT8<T>; friend GeluFfnLayerINT8<T>;
...@@ -143,4 +143,4 @@ private: ...@@ -143,4 +143,4 @@ private:
void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override; void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
}; };
} // namespace fastertransformer } // namespace turbomind
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include "DenseWeight.h" #include "DenseWeight.h"
namespace fastertransformer { namespace turbomind {
template<typename T1, typename T2 = T1> template<typename T1, typename T2 = T1>
struct FfnWeight { struct FfnWeight {
...@@ -29,4 +29,4 @@ struct FfnWeight { ...@@ -29,4 +29,4 @@ struct FfnWeight {
DenseWeight<T1, T2> ia3_weight; DenseWeight<T1, T2> ia3_weight;
}; };
} // namespace fastertransformer } // namespace turbomind
...@@ -16,9 +16,9 @@ ...@@ -16,9 +16,9 @@
#pragma once #pragma once
#include "src/fastertransformer/layers/DenseWeight.h" #include "src/turbomind/layers/DenseWeight.h"
namespace fastertransformer { namespace turbomind {
template<typename T1, typename T2 = T1> template<typename T1, typename T2 = T1>
struct AttentionWeight { struct AttentionWeight {
...@@ -30,4 +30,4 @@ struct AttentionWeight { ...@@ -30,4 +30,4 @@ struct AttentionWeight {
DenseWeight<T1, T2> ia3_value_weight; DenseWeight<T1, T2> ia3_value_weight;
}; };
} // namespace fastertransformer } // namespace turbomind
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment