Unverified Commit 35d64462 authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
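
In short, this commit mechanically renames the vendored FasterTransformer sources: include paths move from src/fastertransformer/ to src/turbomind/, the C++ namespace fastertransformer becomes turbomind, and the FT_LOG_* logging macros become TM_LOG_* (assertion macros such as FT_CHECK are left unchanged in this diff). As a rough illustration only, not code from this commit, a translation unit written against the renamed tree might look like:

// Hypothetical snippet; assumes only the renames visible in this diff.
#include "src/turbomind/utils/logger.h"  // was src/fastertransformer/utils/logger.h

namespace turbomind {  // was: namespace fastertransformer

inline void announce()
{
    TM_LOG_INFO("hello from turbomind");  // was: FT_LOG_INFO(...)
}

}  // namespace turbomind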
@@ -20,15 +20,15 @@
 #include <vector>

 // #include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h"
-#include "src/fastertransformer/layers/BaseLayer.h"
-#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/layers/BaseLayer.h"
+#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/memory_utils.h"

-namespace fastertransformer {
+namespace turbomind {

 enum class AttentionType {
     UNFUSED_MHA,
@@ -159,4 +159,4 @@ public:
     }
 };

-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,10 +16,10 @@
 #pragma once

-#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
-#include "src/fastertransformer/utils/ScaleList.h"
+#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
+#include "src/turbomind/utils/ScaleList.h"

-namespace fastertransformer {
+namespace turbomind {

 template<typename T1, typename T2>
 struct AttentionFP8Weight: public AttentionWeight<T1, T2> {
@@ -31,4 +31,4 @@ struct AttentionFP8Weight: public AttentionWeight<T1, T2> {
     float* identity_h_scale;
 };

-} // namespace fastertransformer
+} // namespace turbomind
@@ -19,15 +19,15 @@
 #include <assert.h>
 #include <vector>

-#include "src/fastertransformer/layers/BaseLayer.h"
-#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h"
-#include "src/fastertransformer/layers/attention_layers_fp8/AttentionFP8Weight.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasFP8MMWrapper.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/layers/BaseLayer.h"
+#include "src/turbomind/layers/attention_layers/BaseAttentionLayer.h"
+#include "src/turbomind/layers/attention_layers_fp8/AttentionFP8Weight.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cublasFP8MMWrapper.h"
+#include "src/turbomind/utils/memory_utils.h"

-namespace fastertransformer {
+namespace turbomind {

 // template<typename T>
 // AttentionType getAttentionType(size_t size_per_head, const int sm, const bool remove_padding, const int max_seq_len,
@@ -62,4 +62,4 @@ public:
     virtual ~BaseAttentionFP8Layer() = default;
 };

-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,14 +16,14 @@
 #pragma once

-#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
-#include "src/fastertransformer/utils/ScaleList.h"
+#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
+#include "src/turbomind/utils/ScaleList.h"

-namespace fastertransformer {
+namespace turbomind {

 template<typename T>
 struct AttentionINT8Weight: AttentionWeight<T> {
     ScaleList* scale_list_ptr;
 };

-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,11 +14,11 @@
  * limitations under the License.
  */

-#include "src/fastertransformer/kernels/beam_search_penalty_kernels.h"
-#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
+#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
+#include "src/turbomind/utils/cuda_utils.h"

-namespace fastertransformer {
+namespace turbomind {

 __global__ void update_indir_cache_kernel(int* tgt_indir_cache,
                                           const int* src_indir_cache,
@@ -112,7 +112,7 @@ BaseBeamSearchLayer<T>::BaseBeamSearchLayer(BaseBeamSearchLayer<T> const& beam_s
 template<typename T>
 BaseBeamSearchLayer<T>::~BaseBeamSearchLayer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     freeBuffer();
 }
@@ -288,4 +288,4 @@ void BaseBeamSearchLayer<T>::forward(TensorMap* output_tensors, TensorMap* input
 template class BaseBeamSearchLayer<float>;
 template class BaseBeamSearchLayer<half>;

-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,10 +16,10 @@
 #pragma once

-#include "src/fastertransformer/kernels/penalty_types.h"
-#include "src/fastertransformer/layers/DynamicDecodeBaseLayer.h"
+#include "src/turbomind/kernels/penalty_types.h"
+#include "src/turbomind/layers/DynamicDecodeBaseLayer.h"

-namespace fastertransformer {
+namespace turbomind {

 template<typename T>
 class BaseBeamSearchLayer: public DynamicDecodeBaseLayer {
@@ -60,8 +60,8 @@ public:
     ~BaseBeamSearchLayer();

     void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;

-    void forward(std::vector<fastertransformer::Tensor>* output_tensors,
-                 const std::vector<fastertransformer::Tensor>* input_tensors) override;
+    void forward(std::vector<turbomind::Tensor>* output_tensors,
+                 const std::vector<turbomind::Tensor>* input_tensors) override;
     void forward(std::unordered_map<std::string, Tensor>* output_tensors,
                  const std::unordered_map<std::string, Tensor>* input_tensors) override;
     void forward(TensorMap* output_tensors, TensorMap* input_tensors) override;
@@ -77,4 +77,4 @@ void update_indir_cache_kernelLauncher(int* tgt_indir_cache,
                                        int ite,
                                        cudaStream_t stream);

-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,10 +14,10 @@
  * limitations under the License.
  */

-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/layers/beam_search_layers/BeamSearchLayer.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/layers/beam_search_layers/BeamSearchLayer.h"

-namespace fastertransformer {
+namespace turbomind {

 template<typename T>
 __global__ void logProbAddCumLogProb(float* log_probs,
@@ -278,7 +278,7 @@ void BeamSearchLayer<T>::allocateBuffer()
 template<typename T>
 void BeamSearchLayer<T>::allocateBuffer(size_t batch_size, size_t beam_width)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     invokeTopkBeamSearch<float>(nullptr,
                                 topk_softmax_workspace_size_,
@@ -345,10 +345,10 @@ BeamSearchLayer<T>::BeamSearchLayer(BeamSearchLayer<T> const& beam_search_layer)
 template<typename T>
 BeamSearchLayer<T>::~BeamSearchLayer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
 }

 template class BeamSearchLayer<float>;
 template class BeamSearchLayer<half>;

-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,11 +16,11 @@
 #pragma once

-#include "src/fastertransformer/kernels/beam_search_topk_kernels.h"
-#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
+#include "src/turbomind/kernels/beam_search_topk_kernels.h"
+#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
 #include <float.h>

-namespace fastertransformer {
+namespace turbomind {

 template<typename T>
 class BeamSearchLayer: public BaseBeamSearchLayer<T> {
@@ -65,4 +65,4 @@ public:
     ~BeamSearchLayer();
 };

-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,9 +14,9 @@
  * limitations under the License.
  */

-#include "src/fastertransformer/layers/beam_search_layers/OnlineBeamSearchLayer.h"
+#include "src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.h"

-namespace fastertransformer {
+namespace turbomind {

 static const int SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS = 128;
 static const int MAX_K = 4;
@@ -184,7 +184,7 @@ void OnlineBeamSearchLayer<T>::allocateBuffer()
 template<typename T>
 void OnlineBeamSearchLayer<T>::allocateBuffer(size_t batch_size, size_t beam_width)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     // we need to check 2 * beam_width candidates each time
     // 64 is the max beam width we support now.
     topk_softmax_workspace_size_ =
@@ -234,16 +234,16 @@ template<typename T>
 OnlineBeamSearchLayer<T>::OnlineBeamSearchLayer(OnlineBeamSearchLayer<T> const& beam_search_layer):
     BaseBeamSearchLayer<T>(beam_search_layer)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
 }

 template<typename T>
 OnlineBeamSearchLayer<T>::~OnlineBeamSearchLayer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
 }

 template class OnlineBeamSearchLayer<float>;
 template class OnlineBeamSearchLayer<half>;

-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,10 +16,10 @@
 #pragma once

-#include "src/fastertransformer/kernels/online_softmax_beamsearch_kernels.h"
-#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
+#include "src/turbomind/kernels/online_softmax_beamsearch_kernels.h"
+#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"

-namespace fastertransformer {
+namespace turbomind {

 template<typename T>
 class OnlineBeamSearchLayer: public BaseBeamSearchLayer<T> {
@@ -62,4 +62,4 @@ public:
     ~OnlineBeamSearchLayer();
 };

-} // namespace fastertransformer
+} // namespace turbomind
@@ -15,20 +15,20 @@
  * limitations under the License.
  */

-#include "src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.h"
-#include "src/fastertransformer/kernels/sampling_penalty_kernels.h"
-#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"
+#include "src/turbomind/kernels/sampling_penalty_kernels.h"
+#include "src/turbomind/kernels/sampling_topk_kernels.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"

 #include <algorithm>

-namespace fastertransformer {
+namespace turbomind {

 template<typename T>
 void BaseSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     curandstate_buf_ = reinterpret_cast<curandState_t*>(
         allocator_->reMalloc(curandstate_buf_, sizeof(curandState_t) * batch_size, false));
     random_seeds_buf_ = reinterpret_cast<unsigned long long*>(
@@ -55,7 +55,7 @@ void BaseSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tenso
 template<typename T>
 void BaseSamplingLayer<T>::freeBuffer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (is_allocate_buffer_) {
         allocator_->free((void**)(&curandstate_buf_));
         allocator_->free((void**)(&random_seeds_buf_));
@@ -122,7 +122,7 @@ void BaseSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_widt
     // repetition_penalty and presence_penalty are mutually exclusive.
     // min_length [1] or [batch_size] on cpu, optional

-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     Tensor runtime_top_k = runtime_args->isExist("runtime_top_k") ? runtime_args->at("runtime_top_k") : Tensor();
     Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor();
     allocateBuffer(batch_size, runtime_top_k, runtime_top_p);
@@ -245,7 +245,7 @@ template<typename T>
 void BaseSamplingLayer<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
                                    const std::unordered_map<std::string, Tensor>* input_tensors)
 {
-    FT_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
+    TM_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
     TensorMap input_map(*input_tensors);
     TensorMap output_map(*output_tensors);
     forward(&output_map, &input_map);
@@ -272,7 +272,7 @@ void BaseSamplingLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_t
     // output_log_probs [local_batch_size], must be float*, optional
     //     The log probs at the current step.

-    FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+    TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
     FT_CHECK(input_tensors->size() >= 4);
     FT_CHECK(output_tensors->size() >= 1);
     const int batch_size = output_tensors->at("output_ids").shape[1];
@@ -355,10 +355,10 @@ void BaseSamplingLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_t
         freeBuffer();
     }
     sync_check_cuda_error();
-    FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
+    TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
 }

 template class BaseSamplingLayer<float>;
 template class BaseSamplingLayer<half>;

-} // namespace fastertransformer
+} // namespace turbomind
@@ -19,10 +19,10 @@
 #include <curand_kernel.h>

-#include "src/fastertransformer/kernels/penalty_types.h"
-#include "src/fastertransformer/layers/DynamicDecodeBaseLayer.h"
+#include "src/turbomind/kernels/penalty_types.h"
+#include "src/turbomind/layers/DynamicDecodeBaseLayer.h"

-namespace fastertransformer {
+namespace turbomind {

 template<typename T>
 class BaseSamplingLayer: public DynamicDecodeBaseLayer {
@@ -85,11 +85,11 @@ public:
     ~BaseSamplingLayer();

     void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
-    void forward(std::vector<fastertransformer::Tensor>* output_tensors,
-                 const std::vector<fastertransformer::Tensor>* input_tensors) override;
+    void forward(std::vector<turbomind::Tensor>* output_tensors,
+                 const std::vector<turbomind::Tensor>* input_tensors) override;
     void forward(std::unordered_map<std::string, Tensor>* output_tensors,
                  const std::unordered_map<std::string, Tensor>* input_tensors) override;
     void forward(TensorMap* output_tensors, TensorMap* input_tensors) override;
 };

-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,13 +17,13 @@
 #include <float.h>

-#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
-#include "src/fastertransformer/kernels/sampling_topp_kernels.h"
-#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
-#include "src/fastertransformer/utils/logger.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/sampling_topk_kernels.h"
+#include "src/turbomind/kernels/sampling_topp_kernels.h"
+#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
+#include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/memory_utils.h"

-namespace fastertransformer {
+namespace turbomind {

 template<uint TOP_K_MAX>
 __global__ void setup_topk_runtime_args(int batch_size,
@@ -85,7 +85,7 @@ void TopKSamplingLayer<T>::allocateBuffer()
 template<typename T>
 void TopKSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     BaseSamplingLayer<T>::allocateBuffer(batch_size, top_k, top_p);
     uint max_top_k = top_k.size() > 0 ? top_k.max<uint>() : 1;
     if (max_top_k == 0) {
@@ -120,7 +120,7 @@ void TopKSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tenso
 template<typename T>
 void TopKSamplingLayer<T>::freeBuffer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (is_allocate_buffer_) {
         allocator_->free((void**)(&sampling_workspace_));
         allocator_->free((void**)(&runtime_top_k_buf_));
@@ -140,7 +140,7 @@ void TopKSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_widt
     // runtime_top_p [1] or [batch_size] on cpu, optional, float.
     // temperature [1] or [batch_size] on cpu, optional
    // repetition_penalty [1] or [batch_size] on cpu, optional

-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     BaseSamplingLayer<T>::setup(batch_size, beam_width, runtime_args);

     uint tmp_top_k = 0;
@@ -205,7 +205,7 @@ void TopKSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
     // output_log_probs [local_batch_size], must be float*, optional
     //     The log probs at the current step.

-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     FT_CHECK(input_tensors->size() >= 4);
     FT_CHECK(output_tensors->size() >= 1);
@@ -308,11 +308,11 @@ TopKSamplingLayer<T>::TopKSamplingLayer(TopKSamplingLayer<T> const& top_k_sampli
 template<typename T>
 TopKSamplingLayer<T>::~TopKSamplingLayer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     freeBuffer();
 }

 template class TopKSamplingLayer<float>;
 template class TopKSamplingLayer<half>;

-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,10 +17,10 @@
 #pragma once

-#include "src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"
+#include "src/turbomind/utils/memory_utils.h"

-namespace fastertransformer {
+namespace turbomind {

 template<typename T>
 class TopKSamplingLayer: public BaseSamplingLayer<T> {
@@ -71,4 +71,4 @@ public:
     void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
 };

-} // namespace fastertransformer
+} // namespace turbomind
@@ -18,14 +18,14 @@
 #include <algorithm>
 #include <float.h>

-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
-#include "src/fastertransformer/kernels/sampling_topp_kernels.h"
-#include "src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.h"
-#include "src/fastertransformer/utils/logger.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/kernels/sampling_topk_kernels.h"
+#include "src/turbomind/kernels/sampling_topp_kernels.h"
+#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
+#include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/memory_utils.h"

-namespace fastertransformer {
+namespace turbomind {

 static __global__ void set_topp_runtime_args(int batch_size,
                                              uint top_k,
@@ -117,7 +117,7 @@ void TopPSamplingLayer<T>::allocateBuffer()
 template<typename T>
 void TopPSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     BaseSamplingLayer<T>::allocateBuffer(batch_size, top_k, top_p);
     invokeTopPSampling<T>(nullptr,  // workspace
                           sampling_workspace_size_,
@@ -163,7 +163,7 @@ void TopPSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tenso
 template<typename T>
 void TopPSamplingLayer<T>::freeBuffer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (is_allocate_buffer_) {
         allocator_->free((void**)(&sampling_workspace_));
         allocator_->free((void**)(&topp_id_vals_buf_));
@@ -196,7 +196,7 @@ void TopPSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_widt
      * \param top_p_reset_ids [batch_size] on gpu, uint32, optional
      **/

-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     BaseSamplingLayer<T>::setup(batch_size, beam_width, runtime_args);

     const Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor();
     const size_t runtime_top_p_size = runtime_top_p.size();
@@ -274,7 +274,7 @@ void TopPSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
      *     log probs at the current step.
      **/

-    FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+    TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
     FT_CHECK(input_tensors->size() >= 4);
     FT_CHECK(output_tensors->size() >= 1);
@@ -339,7 +339,7 @@ void TopPSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
                           local_batch_size,
                           stream_);
     sync_check_cuda_error();
-    FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
+    TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
 }

 template<typename T>
@@ -390,4 +390,4 @@ TopPSamplingLayer<T>::~TopPSamplingLayer()
 template class TopPSamplingLayer<float>;
 template class TopPSamplingLayer<half>;

-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,9 +17,9 @@
 #pragma once

-#include "src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.h"
+#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"

-namespace fastertransformer {
+namespace turbomind {

 template<typename T>
 class TopPSamplingLayer: public BaseSamplingLayer<T> {
@@ -81,4 +81,4 @@ public:
     void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
 };

-} // namespace fastertransformer
+} // namespace turbomind
@@ -19,7 +19,7 @@
 #pragma once

-namespace fastertransformer {
+namespace turbomind {

 template<typename T>
 struct FtWeight {
@@ -46,4 +46,4 @@ public:
     }
 };

-} // namespace fastertransformer
+} // namespace turbomind
@@ -2,16 +2,16 @@
 #pragma once

-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/logger.h"

 #include <pthread.h>

-namespace fastertransformer {
+namespace turbomind {

 class Barrier {
 public:
     Barrier(unsigned count)
     {
-        FT_LOG_INFO("Barrier(%d)", (int)count);
+        TM_LOG_INFO("Barrier(%d)", (int)count);
         pthread_barrier_init(&barrier_, nullptr, count);
     }
@@ -34,4 +34,4 @@ private:
     pthread_barrier_t barrier_{};
 };

-} // namespace fastertransformer
+} // namespace turbomind
 // Copyright (c) OpenMMLab. All rights reserved.

-#include "src/fastertransformer/models/llama/LlamaBatch.h"
-#include "src/fastertransformer/kernels/decoding_kernels.h"
-#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
-#include "src/fastertransformer/models/llama/LlamaV2.h"
-#include "src/fastertransformer/models/llama/Request.h"
-#include "src/fastertransformer/models/llama/llama_utils.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/models/llama/LlamaBatch.h"
+#include "src/turbomind/kernels/decoding_kernels.h"
+#include "src/turbomind/models/llama/LlamaNcclGuard.h"
+#include "src/turbomind/models/llama/LlamaV2.h"
+#include "src/turbomind/models/llama/Request.h"
+#include "src/turbomind/models/llama/llama_utils.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/logger.h"

 #include <cstdint>
 #include <iomanip>
 #include <sstream>
 #include <unordered_map>

-namespace fastertransformer {
+namespace turbomind {

 template<typename T>
 void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_reqs,
@@ -28,7 +28,7 @@ void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_r
     };

     auto invalidate = [](const char* type, std::shared_ptr<Request>& req, int ec) {
-        FT_LOG_WARNING("[verifyRequests] Skipping invalid %s request for id %ld, code = %d", type, (long)req->id, ec);
+        TM_LOG_WARNING("[verifyRequests] Skipping invalid %s request for id %ld, code = %d", type, (long)req->id, ec);
         req->signal.set_value(ec);
         req.reset();
     };
@@ -147,7 +147,7 @@ void LlamaBatch<T>::handleStopRequests(const std::vector<std::shared_ptr<Request
 template<typename T>
 void LlamaBatch<T>::allocateBuffer(size_t batch_size, size_t session_len)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     const size_t batchxbeam = batch_size;
     const size_t hidden_units = llama_->hidden_units_;
@@ -239,7 +239,7 @@ void LlamaBatch<T>::allocatePersistantBuffer(size_t max_batch_size)
 template<typename T>
 void LlamaBatch<T>::freeBuffer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (is_allocate_buffer_) {
         allocator_->free((void**)&context_decoder_input_buf_);
         allocator_->free((void**)&context_decoder_ids_buf_);
@@ -340,7 +340,7 @@ void LlamaBatch<T>::initializeSampling(int infer_request_count)
         }
         inputs.insert({param.first, {ref.where, ref.type, shape, param.second}});
         if (debug_ && rank_ == 0) {
-            FT_LOG_INFO("[initializeSampling] %s", format({param.first, inputs.at(param.first)}).c_str());
+            TM_LOG_INFO("[initializeSampling] %s", format({param.first, inputs.at(param.first)}).c_str());
         }
     }
 }
@@ -441,12 +441,12 @@ void LlamaBatch<T>::initializeGeneration()
     step_ = max_context_len_;

     if (rank_ == 0) {
-        FT_LOG_INFO("[initGen] batch_size = %d", (int)batch_size_);
-        FT_LOG_INFO("[initGen] max_context_len = %d", (int)max_context_len_);
+        TM_LOG_INFO("[initGen] batch_size = %d", (int)batch_size_);
+        TM_LOG_INFO("[initGen] max_context_len = %d", (int)max_context_len_);

-        FT_LOG_INFO("[initGen] slot sequence_id context_len seq_limit_len finished");
+        TM_LOG_INFO("[initGen] slot sequence_id context_len seq_limit_len finished");
         for (int i = 0; i < batch_size_; ++i) {
-            FT_LOG_INFO("[initGen] %4d %11ld %11d %13d %8d",
+            TM_LOG_INFO("[initGen] %4d %11ld %11d %13d %8d",
                         i,
                         (long)cached_seq_[i].id,
                         h_context_length_buf_[i],
@@ -461,7 +461,7 @@ bool LlamaBatch<T>::generate()
 {
     constexpr int kLogInterval = 10;
     if (rank_ == 0 && (step_ - 1) % kLogInterval == 0) {
-        FT_LOG_INFO("------------------------- step = %d -------------------------", step_ - 1);
+        TM_LOG_INFO("------------------------- step = %d -------------------------", step_ - 1);
     }

     const bool is_first_step = step_ == max_context_len_;
@@ -530,14 +530,14 @@ bool LlamaBatch<T>::generate()
             for (int k = 0; k < prev.size(); ++k) {
                 sprev << std::setw(6) << prev[k];
             }
-            FT_LOG_INFO("[ lookup ] step = %d, [%s]", step_ - 1, sprev.str().c_str());
+            TM_LOG_INFO("[ lookup ] step = %d, [%s]", step_ - 1, sprev.str().c_str());
         }

         std::stringstream scurr;
         for (int k = 0; k < curr.size(); ++k) {
             scurr << std::setw(6) << curr[k];
         }
-        FT_LOG_INFO("[generate] step = %d, [%s]", step_ - 1, scurr.str().c_str());
+        TM_LOG_INFO("[generate] step = %d, [%s]", step_ - 1, scurr.str().c_str());
     }

     ////////////////////////////////////////////////
@@ -580,7 +580,7 @@ void LlamaBatch<T>::initialize(const std::vector<std::shared_ptr<Request>>& infe
             seq.cache_len = std::min(seq.cache_len, (size_t)step);
         }
         else if (rank_ == 0) {
-            FT_LOG_WARNING("[initialize] Skipping invalid step (%d) setting for ID %ld", step, (long)seq.id);
+            TM_LOG_WARNING("[initialize] Skipping invalid step (%d) setting for ID %ld", step, (long)seq.id);
         }
     }
@@ -697,7 +697,7 @@ void LlamaBatch<T>::initialize(const std::vector<std::shared_ptr<Request>>& infe
             request_seq_len_limit_[i] = session_len_ - 1;
             if (rank_ == 0) {
                 const int trunc_output_len = request_seq_len_limit_[i] - h_context_length_buf_[i];
-                FT_LOG_WARNING(
+                TM_LOG_WARNING(
                     "[initialize] [%ld] total sequence length (%d + %d) exceeds session_len (%d), request_output_len is truncated to %d",
                     (long)seq.id,
                     h_context_length_buf_[i],
@@ -729,15 +729,15 @@ void LlamaBatch<T>::initialize(const std::vector<std::shared_ptr<Request>>& infe
         v_cache_ptr_buf_, h_v_cache_ptr_buf_, sizeof(uintptr_t) * batch_size_, cudaMemcpyDefault, stream_));

     if (llama_->tensor_para_.rank_ == 0) {
-        FT_LOG_INFO("[init] infer_request_count = %d", (int)infer_request_count);
-        FT_LOG_INFO("[init] batch_size = %d", (int)batch_size_);
-        FT_LOG_INFO("[init] session_len = %d", (int)session_len_);
-        FT_LOG_INFO("[init] max_input_length = %d", (int)max_input_length);
-        FT_LOG_INFO("[init] max_context_len = %d", (int)max_context_len);
-        FT_LOG_INFO(
+        TM_LOG_INFO("[init] infer_request_count = %d", (int)infer_request_count);
+        TM_LOG_INFO("[init] batch_size = %d", (int)batch_size_);
+        TM_LOG_INFO("[init] session_len = %d", (int)session_len_);
+        TM_LOG_INFO("[init] max_input_length = %d", (int)max_input_length);
+        TM_LOG_INFO("[init] max_context_len = %d", (int)max_context_len);
+        TM_LOG_INFO(
             "[init] slot sequence_id history_len input_len context_len tmp_input_len token_ids.size cache_len");
         for (int i = batch_size_ - infer_request_count; i < batch_size_; ++i) {
-            FT_LOG_INFO("[init] %4d %11ld %11d %9d %11d %13d %14d %9d",
+            TM_LOG_INFO("[init] %4d %11ld %11d %9d %11d %13d %14d %9d",
                         i,
                         (int)cached_seq_[i].id,
                         h_history_length_buf_[i],
@@ -766,7 +766,7 @@ void LlamaBatch<T>::contextDecode()
     const int context_decode_count = batch_size_ - base;

     if (rank_ == 0) {
-        FT_LOG_INFO("[decodeContext] base = %d, count = %d", base, context_decode_count);
+        TM_LOG_INFO("[decodeContext] base = %d, count = %d", base, context_decode_count);
     }
     invokePlusScalar(input_length_buf_ + base, -1, context_decode_count, stream_);
     invokePlusScalar(context_length_buf_ + base, -1, context_decode_count, stream_);
@@ -782,7 +782,7 @@ void LlamaBatch<T>::contextDecode()
         if (i == batch_size_ || token_num + h_context_length_buf_[i] > max_context_token_num_) {
             const int context_decode_batch_size = i - offset;
             if (rank_ == 0) {
-                FT_LOG_INFO(
+                TM_LOG_INFO(
                     "[decodeContext] offset = %d, batch_size = %d, token_num = %d, max_input_len = %d, max_context_len = %d",
                     base,
                     context_decode_batch_size,
@@ -841,11 +841,11 @@ void LlamaBatch<T>::contextDecode()
         check_cuda_error(cudaStreamSynchronize(stream_));
         const auto tock = std::chrono::high_resolution_clock::now();
         if (rank_ == 0) {
-            FT_LOG_INFO("[decodeContext] %.2f ms", std::chrono::duration<float, std::milli>(tock - tick).count());
+            TM_LOG_INFO("[decodeContext] %.2f ms", std::chrono::duration<float, std::milli>(tock - tick).count());
         }
     }
     else if (rank_ == 0) {
-        FT_LOG_INFO("[decodeContext] Context decoding is not needed.");
+        TM_LOG_INFO("[decodeContext] Context decoding is not needed.");
     }
 }
@@ -874,7 +874,7 @@ void LlamaBatch<T>::finish()
         for (int i = 0; i < batch_size_; ++i) {
             ss << (i ? ", " : "") << "(" << h_sequence_lengths_[i] << "," << h_finished_buf_[i] << ")";
         }
-        FT_LOG_INFO("[finish] [%s]", ss.str().c_str());
+        TM_LOG_INFO("[finish] [%s]", ss.str().c_str());
     }

     for (int i = 0; i < batch_size_; ++i) {
@@ -930,7 +930,7 @@ void LlamaBatch<T>::synchronize()
     batch_size_ = idx;

     if (rank_ == 0) {
-        FT_LOG_INFO("[synchronize] batch_size = %d", (int)batch_size_);
+        TM_LOG_INFO("[synchronize] batch_size = %d", (int)batch_size_);
     }

     finished_count_ = 0;
@@ -973,7 +973,7 @@ template<typename T>
 void LlamaBatch<T>::finishRequest(int index, bool force_end)
 {
     if (rank_ == 0) {
-        FT_LOG_INFO("[finishRequest] slot = %d, id = %lu", index, (long)requests_[index]->id);
+        TM_LOG_INFO("[finishRequest] slot = %d, id = %lu", index, (long)requests_[index]->id);
     }

     if (debug_ && rank_ == 0) {
@@ -988,7 +988,7 @@ void LlamaBatch<T>::finishRequest(int index, bool force_end)
         for (const auto& t : tokens) {
             ss << " " << t;
         }
-        FT_LOG_INFO("[finishRequest] slot %d, tokens [%s]", index, ss.str().c_str());
+        TM_LOG_INFO("[finishRequest] slot %d, tokens [%s]", index, ss.str().c_str());
     }

     auto& output_ids_tensor = requests_[index]->outputs[rank_].at("output_ids");
@@ -1039,4 +1039,4 @@ void LlamaBatch<T>::finishRequest(int index, bool force_end)
 template class LlamaBatch<half>;
 template class LlamaBatch<float>;

-} // namespace fastertransformer
+} // namespace turbomind
@@ -2,12 +2,12 @@
 #pragma once

-#include "src/fastertransformer/models/llama/LlamaCacheManager.h"
-#include "src/fastertransformer/models/llama/Request.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
+#include "src/turbomind/models/llama/LlamaCacheManager.h"
+#include "src/turbomind/models/llama/Request.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"

-namespace fastertransformer {
+namespace turbomind {

 template<typename T>
 class LlamaV2;
@@ -150,4 +150,4 @@ private:
     IAllocator* allocator_{};
 };

-} // namespace fastertransformer
+} // namespace turbomind