Unverified Commit 35d64462 authored by lvhan028, committed by GitHub
Browse files

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
......@@ -17,7 +17,7 @@
#include <curand_kernel.h>
namespace fastertransformer {
namespace turbomind {
void invokeTopPInitialize(int* topp_id_val_buf,
int* topp_offset_buf,
......@@ -152,4 +152,4 @@ void invokeComputeToppDecay(float* runtime_top_p,
const int local_batch_size,
cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
......@@ -14,12 +14,12 @@
* limitations under the License.
*/
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/kernels/stop_criteria_kernels.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/stop_criteria_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
namespace fastertransformer {
namespace turbomind {
__global__ void stop_words_criterion(const int* output_ids,
const int* parent_ids,
......@@ -91,7 +91,7 @@ void invokeStopWordsCriterion(const int* output_ids,
int step,
cudaStream_t stream)
{
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
// Check if we have sampled a word from the stop_words list. If so, stop the sequence.
dim3 block, grid;
block.x = min(((stop_words_len + 32 - 1) / 32) * 32, 256UL);
......@@ -143,7 +143,7 @@ void invokeLengthCriterion(bool* finished,
{
// Check if we have attained the sequence length limit. If so, stop the sequence.
// In addition, check if all sequences are stopped and return the result in should_stop
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
dim3 block{min(512, uint32_t(batch_size * beam_width))};
dim3 grid{1};
h_pinned_finished_sum_[0] = -1;
......@@ -156,4 +156,4 @@ void invokeLengthCriterion(bool* finished,
*should_stop = h_pinned_finished_sum_[0] == batch_size * beam_width;
}
} // namespace fastertransformer
} // namespace turbomind
......@@ -17,7 +17,7 @@
#include <cuda_runtime.h>
namespace fastertransformer {
namespace turbomind {
void invokeStopWordsCriterion(const int* output_ids,
const int* parent_ids,
......@@ -39,4 +39,4 @@ void invokeLengthCriterion(bool* finished,
int step,
cudaStream_t stream);
} // namespace fastertransformer
} // namespace turbomind
......@@ -15,13 +15,13 @@
* limitations under the License.
*/
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/kernels/unfused_attention_kernels.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_utils.h"
namespace fastertransformer {
namespace turbomind {
__inline__ __device__ int target_index(int id1, int id2, int id3, int id4, int dim_1, int dim_2, int dim_3, int dim_4)
{
......@@ -2520,4 +2520,4 @@ INSTANTIATETRANSPOSEATTENTIONS(__nv_bfloat16);
#endif
#undef INSTANTIATETRANSPOSEATTENTIONS
} // namespace fastertransformer
} // namespace turbomind
......@@ -15,9 +15,9 @@
*/
#pragma once
#include "src/fastertransformer/utils/Tensor.h"
#include "src/turbomind/utils/Tensor.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
void invokeAddQKVBiasIA3Transpose(T* q_buf,
......@@ -262,4 +262,4 @@ void invokeMaskedSoftMaxWithRelPosBias(T* qk_buf,
template<typename T>
void invokeTransposeAttentions(Tensor& attentions_out, const Tensor& attentions_in, cudaStream_t stream = 0);
} // namespace fastertransformer
} // namespace turbomind
......@@ -18,11 +18,11 @@
#include <assert.h>
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
namespace fastertransformer {
namespace turbomind {
class BaseLayer {
public:
......@@ -65,4 +65,4 @@ protected:
bool sparse_;
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -15,10 +15,10 @@
*/
#pragma once
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#include "stdlib.h"
namespace fastertransformer {
namespace turbomind {
// Note that the int8 mode of BERT and GPT are different.
// For int8 mode = 2 on GPT:
......@@ -63,4 +63,4 @@ struct DenseWeight {
bool fuse_gemm_bias = false;
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -19,9 +19,9 @@
#include <string>
#include <unordered_map>
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/turbomind/layers/BaseLayer.h"
namespace fastertransformer {
namespace turbomind {
class DynamicDecodeBaseLayer: public BaseLayer {
protected:
......@@ -39,11 +39,11 @@ public:
DynamicDecodeBaseLayer(DynamicDecodeBaseLayer const& dynamic_decode_layer): BaseLayer(dynamic_decode_layer){};
virtual void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) = 0;
virtual void forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors) = 0;
virtual void forward(std::vector<turbomind::Tensor>* output_tensors,
const std::vector<turbomind::Tensor>* input_tensors) = 0;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors) = 0;
virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors) = 0;
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -14,21 +14,21 @@
* limitations under the License.
*/
#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
#include "src/fastertransformer/kernels/ban_bad_words.h"
#include "src/fastertransformer/kernels/stop_criteria_kernels.h"
#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include "src/fastertransformer/layers/beam_search_layers/BeamSearchLayer.h"
#include "src/fastertransformer/layers/beam_search_layers/OnlineBeamSearchLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/kernels/ban_bad_words.h"
#include "src/turbomind/kernels/stop_criteria_kernels.h"
#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include "src/turbomind/layers/beam_search_layers/BeamSearchLayer.h"
#include "src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
void DynamicDecodeLayer<T>::allocateBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
h_pinned_finished_sum_ = (int*)allocator_->reMalloc(h_pinned_finished_sum_, sizeof(int), true, true);
return;
}
......@@ -36,7 +36,7 @@ void DynamicDecodeLayer<T>::allocateBuffer()
template<typename T>
void DynamicDecodeLayer<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
allocator_->free((void**)(&h_pinned_finished_sum_), true);
return;
}
......@@ -44,7 +44,7 @@ void DynamicDecodeLayer<T>::freeBuffer()
template<typename T>
void DynamicDecodeLayer<T>::initialize()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
online_beamsearch_decode_ = new OnlineBeamSearchLayer<T>(0, // max_batch_size, deprecated
0, // local_head_num, deprecated
0, // size_per_head, deprecated
......@@ -123,14 +123,14 @@ DynamicDecodeLayer<T>::DynamicDecodeLayer(size_t vocab_size,
vocab_size_padded_(vocab_size_padded),
cuda_device_prop_(cuda_device_prop)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize();
}
template<typename T>
DynamicDecodeLayer<T>::~DynamicDecodeLayer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
delete online_beamsearch_decode_;
delete beamsearch_decode_;
delete topk_decode_;
......@@ -145,7 +145,7 @@ DynamicDecodeLayer<T>::DynamicDecodeLayer(DynamicDecodeLayer const& dynamic_deco
vocab_size_padded_(dynamic_decode_layer.vocab_size_padded_),
cuda_device_prop_(dynamic_decode_layer.cuda_device_prop_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize();
}
......@@ -169,7 +169,7 @@ void DynamicDecodeLayer<T>::setup(const size_t batch_size, const size_t beam_wid
* \param top_p_reset_ids [batch_size] on gpu, uint32, optional
*/
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
has_diff_runtime_args_ = hasDiffRuntimeArgs(runtime_args);
if (beam_width == 1) { // sampling layers
topk_decode_->setup(batch_size, beam_width, runtime_args);
......@@ -181,7 +181,7 @@ template<typename T>
void DynamicDecodeLayer<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
TensorMap input_map(*input_tensors);
TensorMap output_map(*output_tensors);
forward(&output_map, &input_map);
......@@ -235,7 +235,7 @@ void DynamicDecodeLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_
*
*/
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
const int ite = (int)input_tensors->at("ite").getVal<uint>();
const int step = input_tensors->at("step").getVal<int>();
FT_CHECK(input_tensors->at("logits").shape.size() == 3);
......@@ -516,4 +516,4 @@ bool DynamicDecodeLayer<T>::hasDiffRuntimeArgs(TensorMap* input_tensors)
template class DynamicDecodeLayer<float>;
template class DynamicDecodeLayer<half>;
} // namespace fastertransformer
} // namespace turbomind
......@@ -19,12 +19,12 @@
#include <string>
#include <unordered_map>
#include "src/fastertransformer/kernels/beam_search_topk_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/layers/DynamicDecodeBaseLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.h"
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/DynamicDecodeBaseLayer.h"
#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
class DynamicDecodeLayer: public BaseLayer {
......@@ -83,4 +83,4 @@ public:
const std::unordered_map<std::string, Tensor>* input_tensors);
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -14,12 +14,12 @@
* limitations under the License.
*/
#include "src/fastertransformer/layers/FfnFP8Layer.h"
#include "src/fastertransformer/kernels/activation_fp8_kernels.h"
#include "src/fastertransformer/utils/cublasFP8MMWrapper.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
#include "src/turbomind/layers/FfnFP8Layer.h"
#include "src/turbomind/kernels/activation_fp8_kernels.h"
#include "src/turbomind/utils/cublasFP8MMWrapper.h"
#include "src/turbomind/utils/nvtx_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::forward(TensorMap* output_tensors,
......@@ -32,7 +32,7 @@ void FfnFP8Layer<T1, T2>::forward(TensorMap* output_tensors,
// output tensors:
// output_hidden_state [token_num, d_model],
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() == 1);
FT_CHECK(output_tensors->size() == 1);
......@@ -396,7 +396,7 @@ FfnFP8Layer<T1, T2>::FfnFP8Layer(size_t inter_size,
inter_size_(inter_size),
fp8_mode_(fp8_mode)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template<typename T1, typename T2>
......@@ -410,13 +410,13 @@ FfnFP8Layer<T1, T2>::FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer):
inter_size_(ffn_layer.inter_size_),
fp8_mode_(ffn_layer.fp8_mode_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::~FfnFP8Layer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
cublas_wrapper_ = nullptr;
freeBuffer();
}
......@@ -430,7 +430,7 @@ void FfnFP8Layer<T1, T2>::allocateBuffer()
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::allocateBuffer(size_t token_num)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
inter_buf_ = (T1*)allocator_->reMalloc(inter_buf_, sizeof(T1) * token_num * inter_size_, false);
inter_buf_bf16_ = (T2*)allocator_->reMalloc(inter_buf_bf16_, sizeof(T2) * token_num * inter_size_, false);
......@@ -440,7 +440,7 @@ void FfnFP8Layer<T1, T2>::allocateBuffer(size_t token_num)
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&inter_buf_));
allocator_->free((void**)(&inter_buf_bf16_));
......@@ -532,4 +532,4 @@ void ReluFfnFP8Layer<T1, T2>::invokeAddBiasActivation(const int m,
template class ReluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;
} // namespace fastertransformer
} // namespace turbomind
......@@ -16,13 +16,13 @@
#pragma once
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/layers/FfnFP8Weight.h"
#include "src/fastertransformer/layers/FfnLayer.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/FfnFP8Weight.h"
#include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/utils/memory_utils.h"
#include <vector>
namespace fastertransformer {
namespace turbomind {
template<typename T1, typename T2>
class FfnFP8Layer: public BaseLayer {
......@@ -130,4 +130,4 @@ private:
const float* output_scale) override;
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -17,8 +17,8 @@
#pragma once
#include "FfnWeight.h"
#include "src/fastertransformer/utils/ScaleList.h"
namespace fastertransformer {
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {
template<typename T1, typename T2>
struct FfnFP8Weight: FfnWeight<T1, T2> {
......@@ -27,4 +27,4 @@ struct FfnFP8Weight: FfnWeight<T1, T2> {
float* identity_h_scale;
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -17,12 +17,12 @@
#pragma once
#include "FfnWeight.h"
#include "src/fastertransformer/utils/ScaleList.h"
namespace fastertransformer {
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {
template<typename T>
struct FfnINT8Weight: FfnWeight<T> {
ScaleList* scale_list_ptr;
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -14,15 +14,15 @@
* limitations under the License.
*/
#include "src/fastertransformer/layers/FfnLayer.h"
#include "src/fastertransformer/kernels/transpose_int8_kernels.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
#include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/kernels/transpose_int8_kernels.h"
#include "src/turbomind/utils/nvtx_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
void FfnLayer<T>::forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors,
void FfnLayer<T>::forward(std::vector<turbomind::Tensor>* output_tensors,
const std::vector<turbomind::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights)
{
TensorMap input_tensor({{"ffn_input", input_tensors->at(0)}});
......@@ -46,7 +46,7 @@ void FfnLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors, c
// expanded_source_row_to_expanded_dest_row [token_num, moe_k] (optional)
// expert_for_source_row [token_num, moe_k] (optional)
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() >= 1 && input_tensors->size() <= 5);
FT_CHECK(output_tensors->size() >= 1 || output_tensors->size() <= 4);
bool use_moe = false;
......@@ -405,7 +405,7 @@ FfnLayer<T>::FfnLayer(size_t max_batch_size,
use_gated_activation_(use_gated_activation),
int8_fc_runner_(int8_mode == 2 ? std::make_shared<CutlassInt8GemmRunner<T>>() : nullptr)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (int8_mode_ == 0) {
moe_fc_runner_ = std::make_shared<CutlassMoeFCRunner<T, T>>();
}
......@@ -438,13 +438,13 @@ FfnLayer<T>::FfnLayer(FfnLayer<T> const& ffn_layer):
weight_only_int8_fc_runner_(ffn_layer.weight_only_int8_fc_runner_),
int8_fc_runner_(ffn_layer.int8_fc_runner_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template<typename T>
FfnLayer<T>::~FfnLayer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
cublas_wrapper_ = nullptr;
freeBuffer();
}
......@@ -459,7 +459,7 @@ void FfnLayer<T>::allocateBuffer()
template<typename T>
void FfnLayer<T>::allocateBuffer(size_t token_num, int moe_k, bool use_moe)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (use_moe) {
moe_gates_buf_ =
(T*)allocator_->reMalloc(moe_gates_buf_, sizeof(T) * pad_to_multiple_of_16(token_num * expert_num_), false);
......@@ -505,7 +505,7 @@ void FfnLayer<T>::allocateBuffer(size_t token_num, int moe_k, bool use_moe)
template<typename T>
void FfnLayer<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&inter_buf_));
if (use_gated_activation_) {
......@@ -712,4 +712,4 @@ template class SiluFfnLayer<half>;
template class SiluFfnLayer<__nv_bfloat16>;
#endif
} // namespace fastertransformer
} // namespace turbomind
......@@ -16,20 +16,20 @@
#pragma once
#include "src/fastertransformer/kernels/activation_kernels.h"
#include "src/fastertransformer/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
#include "src/fastertransformer/kernels/cutlass_kernels/int8_gemm/int8_gemm.h"
#include "src/fastertransformer/kernels/matrix_vector_multiplication.h"
#include "src/fastertransformer/kernels/moe_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/layers/FfnWeight.h"
#include "src/fastertransformer/utils/activation_types.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/kernels/activation_kernels.h"
#include "src/turbomind/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
#include "src/turbomind/kernels/cutlass_kernels/int8_gemm/int8_gemm.h"
#include "src/turbomind/kernels/matrix_vector_multiplication.h"
#include "src/turbomind/kernels/moe_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/FfnWeight.h"
#include "src/turbomind/utils/activation_types.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include <stdint.h>
#include <vector>
namespace fastertransformer {
namespace turbomind {
template<typename T>
class FfnLayer: public BaseLayer {
......@@ -122,8 +122,8 @@ public:
inter_size_ = runtime_inter_size;
}
virtual void forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors,
virtual void forward(std::vector<turbomind::Tensor>* output_tensors,
const std::vector<turbomind::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights);
virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnWeight<T>* ffn_weights);
};
......@@ -229,4 +229,4 @@ private:
using FfnLayer<T>::inter_size_;
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -15,13 +15,13 @@
*/
#include "FfnLayerINT8.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
#include "src/turbomind/utils/nvtx_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
void FfnLayerINT8<T>::forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors,
void FfnLayerINT8<T>::forward(std::vector<turbomind::Tensor>* output_tensors,
const std::vector<turbomind::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights)
{
// input_tensors: [input (token_num, hidden_dimension)]
......@@ -337,4 +337,4 @@ void ReluFfnLayerINT8<T>::invokeAddBiasActivation(const int m, const T* bias, Sc
template class ReluFfnLayerINT8<float>;
template class ReluFfnLayerINT8<half>;
} // namespace fastertransformer
} // namespace turbomind
......@@ -17,16 +17,16 @@
#pragma once
#include "FfnINT8Weight.h"
#include "src/fastertransformer/kernels/activation_int8_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/utils/ScaleList.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasINT8MMWrapper.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/kernels/activation_int8_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/utils/ScaleList.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasINT8MMWrapper.h"
#include "src/turbomind/utils/memory_utils.h"
#include <vector>
namespace fastertransformer {
namespace turbomind {
template<typename T>
class GeluFfnLayerINT8;
......@@ -77,8 +77,8 @@ public:
~FfnLayerINT8();
void forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors,
void forward(std::vector<turbomind::Tensor>* output_tensors,
const std::vector<turbomind::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights);
friend GeluFfnLayerINT8<T>;
......@@ -143,4 +143,4 @@ private:
void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -18,7 +18,7 @@
#include "DenseWeight.h"
namespace fastertransformer {
namespace turbomind {
template<typename T1, typename T2 = T1>
struct FfnWeight {
......@@ -29,4 +29,4 @@ struct FfnWeight {
DenseWeight<T1, T2> ia3_weight;
};
} // namespace fastertransformer
} // namespace turbomind
......@@ -16,9 +16,9 @@
#pragma once
#include "src/fastertransformer/layers/DenseWeight.h"
#include "src/turbomind/layers/DenseWeight.h"
namespace fastertransformer {
namespace turbomind {
template<typename T1, typename T2 = T1>
struct AttentionWeight {
......@@ -30,4 +30,4 @@ struct AttentionWeight {
DenseWeight<T1, T2> ia3_value_weight;
};
} // namespace fastertransformer
} // namespace turbomind
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment