Unverified Commit 35d64462 authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
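
The diff below is a mechanical rename applied across the tree: include paths move from src/fastertransformer/... to src/turbomind/..., the fastertransformer namespace becomes turbomind, and the FT_LOG_* logger macros become TM_LOG_* (other FT_-prefixed macros such as FT_CHECK are left unchanged by this commit). A minimal sketch of the pattern, using a hypothetical file that is not part of this diff:

--- a/src/fastertransformer/utils/example.cc
+++ b/src/turbomind/utils/example.cc
 // Hypothetical file, shown only to illustrate the rename pattern.
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/logger.h"
-namespace fastertransformer {
+namespace turbomind {
 void example()
 {
-    FT_LOG_INFO("example");  // old logger macro
+    TM_LOG_INFO("example");  // new logger macro
 }
-} // namespace fastertransformer
+} // namespace turbomind
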
@@ -20,15 +20,15 @@
 #include <vector>
 // #include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h"
-#include "src/fastertransformer/layers/BaseLayer.h"
-#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/layers/BaseLayer.h"
+#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 enum class AttentionType {
     UNFUSED_MHA,
@@ -159,4 +159,4 @@ public:
     }
 };
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,10 +16,10 @@
 #pragma once
-#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
-#include "src/fastertransformer/utils/ScaleList.h"
+#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
+#include "src/turbomind/utils/ScaleList.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T1, typename T2>
 struct AttentionFP8Weight: public AttentionWeight<T1, T2> {
@@ -31,4 +31,4 @@ struct AttentionFP8Weight: public AttentionWeight<T1, T2> {
     float* identity_h_scale;
 };
-} // namespace fastertransformer
+} // namespace turbomind
@@ -19,15 +19,15 @@
 #include <assert.h>
 #include <vector>
-#include "src/fastertransformer/layers/BaseLayer.h"
-#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h"
-#include "src/fastertransformer/layers/attention_layers_fp8/AttentionFP8Weight.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasFP8MMWrapper.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/layers/BaseLayer.h"
+#include "src/turbomind/layers/attention_layers/BaseAttentionLayer.h"
+#include "src/turbomind/layers/attention_layers_fp8/AttentionFP8Weight.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cublasFP8MMWrapper.h"
+#include "src/turbomind/utils/memory_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 // template<typename T>
 // AttentionType getAttentionType(size_t size_per_head, const int sm, const bool remove_padding, const int max_seq_len,
@@ -62,4 +62,4 @@ public:
     virtual ~BaseAttentionFP8Layer() = default;
 };
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,14 +16,14 @@
 #pragma once
-#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
-#include "src/fastertransformer/utils/ScaleList.h"
+#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
+#include "src/turbomind/utils/ScaleList.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 struct AttentionINT8Weight: AttentionWeight<T> {
     ScaleList* scale_list_ptr;
 };
-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/kernels/beam_search_penalty_kernels.h"
-#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
+#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
+#include "src/turbomind/utils/cuda_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 __global__ void update_indir_cache_kernel(int* tgt_indir_cache,
                                           const int* src_indir_cache,
@@ -112,7 +112,7 @@ BaseBeamSearchLayer<T>::BaseBeamSearchLayer(BaseBeamSearchLayer<T> const& beam_s
 template<typename T>
 BaseBeamSearchLayer<T>::~BaseBeamSearchLayer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     freeBuffer();
 }
@@ -288,4 +288,4 @@ void BaseBeamSearchLayer<T>::forward(TensorMap* output_tensors, TensorMap* input
 template class BaseBeamSearchLayer<float>;
 template class BaseBeamSearchLayer<half>;
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,10 +16,10 @@
 #pragma once
-#include "src/fastertransformer/kernels/penalty_types.h"
-#include "src/fastertransformer/layers/DynamicDecodeBaseLayer.h"
+#include "src/turbomind/kernels/penalty_types.h"
+#include "src/turbomind/layers/DynamicDecodeBaseLayer.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 class BaseBeamSearchLayer: public DynamicDecodeBaseLayer {
@@ -60,8 +60,8 @@ public:
     ~BaseBeamSearchLayer();
     void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
-    void forward(std::vector<fastertransformer::Tensor>* output_tensors,
-                 const std::vector<fastertransformer::Tensor>* input_tensors) override;
+    void forward(std::vector<turbomind::Tensor>* output_tensors,
+                 const std::vector<turbomind::Tensor>* input_tensors) override;
     void forward(std::unordered_map<std::string, Tensor>* output_tensors,
                  const std::unordered_map<std::string, Tensor>* input_tensors) override;
     void forward(TensorMap* output_tensors, TensorMap* input_tensors) override;
@@ -77,4 +77,4 @@ void update_indir_cache_kernelLauncher(int* tgt_indir_cache,
                                        int ite,
                                        cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/layers/beam_search_layers/BeamSearchLayer.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/layers/beam_search_layers/BeamSearchLayer.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 __global__ void logProbAddCumLogProb(float* log_probs,
@@ -278,7 +278,7 @@ void BeamSearchLayer<T>::allocateBuffer()
 template<typename T>
 void BeamSearchLayer<T>::allocateBuffer(size_t batch_size, size_t beam_width)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     invokeTopkBeamSearch<float>(nullptr,
                                 topk_softmax_workspace_size_,
@@ -345,10 +345,10 @@ BeamSearchLayer<T>::BeamSearchLayer(BeamSearchLayer<T> const& beam_search_layer)
 template<typename T>
 BeamSearchLayer<T>::~BeamSearchLayer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
 }
 template class BeamSearchLayer<float>;
 template class BeamSearchLayer<half>;
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,11 +16,11 @@
 #pragma once
-#include "src/fastertransformer/kernels/beam_search_topk_kernels.h"
-#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
+#include "src/turbomind/kernels/beam_search_topk_kernels.h"
+#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
 #include <float.h>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 class BeamSearchLayer: public BaseBeamSearchLayer<T> {
@@ -65,4 +65,4 @@ public:
     ~BeamSearchLayer();
 };
-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/layers/beam_search_layers/OnlineBeamSearchLayer.h"
+#include "src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.h"
-namespace fastertransformer {
+namespace turbomind {
 static const int SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS = 128;
 static const int MAX_K = 4;
@@ -184,7 +184,7 @@ void OnlineBeamSearchLayer<T>::allocateBuffer()
 template<typename T>
 void OnlineBeamSearchLayer<T>::allocateBuffer(size_t batch_size, size_t beam_width)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     // we need to check 2 * beam_width candidates each time
     // 64 is the max beam width we support now.
     topk_softmax_workspace_size_ =
@@ -234,16 +234,16 @@ template<typename T>
 OnlineBeamSearchLayer<T>::OnlineBeamSearchLayer(OnlineBeamSearchLayer<T> const& beam_search_layer):
     BaseBeamSearchLayer<T>(beam_search_layer)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
 }
 template<typename T>
 OnlineBeamSearchLayer<T>::~OnlineBeamSearchLayer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
 }
 template class OnlineBeamSearchLayer<float>;
 template class OnlineBeamSearchLayer<half>;
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,10 +16,10 @@
 #pragma once
-#include "src/fastertransformer/kernels/online_softmax_beamsearch_kernels.h"
-#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
+#include "src/turbomind/kernels/online_softmax_beamsearch_kernels.h"
+#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 class OnlineBeamSearchLayer: public BaseBeamSearchLayer<T> {
@@ -62,4 +62,4 @@ public:
     ~OnlineBeamSearchLayer();
 };
-} // namespace fastertransformer
+} // namespace turbomind
@@ -15,20 +15,20 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.h"
-#include "src/fastertransformer/kernels/sampling_penalty_kernels.h"
-#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"
+#include "src/turbomind/kernels/sampling_penalty_kernels.h"
+#include "src/turbomind/kernels/sampling_topk_kernels.h"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
 #include <algorithm>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void BaseSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     curandstate_buf_ = reinterpret_cast<curandState_t*>(
         allocator_->reMalloc(curandstate_buf_, sizeof(curandState_t) * batch_size, false));
     random_seeds_buf_ = reinterpret_cast<unsigned long long*>(
@@ -55,7 +55,7 @@ void BaseSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tenso
 template<typename T>
 void BaseSamplingLayer<T>::freeBuffer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (is_allocate_buffer_) {
         allocator_->free((void**)(&curandstate_buf_));
         allocator_->free((void**)(&random_seeds_buf_));
@@ -122,7 +122,7 @@ void BaseSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_widt
     // repetition_penalty and presence_penalty are mutually exclusive.
     // min_length [1] or [batch_size] on cpu, optional
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     Tensor runtime_top_k = runtime_args->isExist("runtime_top_k") ? runtime_args->at("runtime_top_k") : Tensor();
     Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor();
     allocateBuffer(batch_size, runtime_top_k, runtime_top_p);
@@ -245,7 +245,7 @@ template<typename T>
 void BaseSamplingLayer<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
                                    const std::unordered_map<std::string, Tensor>* input_tensors)
 {
-    FT_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
+    TM_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
     TensorMap input_map(*input_tensors);
     TensorMap output_map(*output_tensors);
     forward(&output_map, &input_map);
@@ -272,7 +272,7 @@ void BaseSamplingLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_t
     // output_log_probs [local_batch_size], must be float*, optional
     // The log probs at the current step.
-    FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+    TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
     FT_CHECK(input_tensors->size() >= 4);
     FT_CHECK(output_tensors->size() >= 1);
     const int batch_size = output_tensors->at("output_ids").shape[1];
@@ -355,10 +355,10 @@ void BaseSamplingLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_t
         freeBuffer();
     }
     sync_check_cuda_error();
-    FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
+    TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
 }
 template class BaseSamplingLayer<float>;
 template class BaseSamplingLayer<half>;
-} // namespace fastertransformer
+} // namespace turbomind
@@ -19,10 +19,10 @@
 #include <curand_kernel.h>
-#include "src/fastertransformer/kernels/penalty_types.h"
-#include "src/fastertransformer/layers/DynamicDecodeBaseLayer.h"
+#include "src/turbomind/kernels/penalty_types.h"
+#include "src/turbomind/layers/DynamicDecodeBaseLayer.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 class BaseSamplingLayer: public DynamicDecodeBaseLayer {
@@ -85,11 +85,11 @@ public:
     ~BaseSamplingLayer();
     void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
-    void forward(std::vector<fastertransformer::Tensor>* output_tensors,
-                 const std::vector<fastertransformer::Tensor>* input_tensors) override;
+    void forward(std::vector<turbomind::Tensor>* output_tensors,
+                 const std::vector<turbomind::Tensor>* input_tensors) override;
     void forward(std::unordered_map<std::string, Tensor>* output_tensors,
                  const std::unordered_map<std::string, Tensor>* input_tensors) override;
     void forward(TensorMap* output_tensors, TensorMap* input_tensors) override;
 };
-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,13 +17,13 @@
 #include <float.h>
-#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
-#include "src/fastertransformer/kernels/sampling_topp_kernels.h"
-#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
-#include "src/fastertransformer/utils/logger.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/sampling_topk_kernels.h"
+#include "src/turbomind/kernels/sampling_topp_kernels.h"
+#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
+#include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/memory_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<uint TOP_K_MAX>
 __global__ void setup_topk_runtime_args(int batch_size,
@@ -85,7 +85,7 @@ void TopKSamplingLayer<T>::allocateBuffer()
 template<typename T>
 void TopKSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     BaseSamplingLayer<T>::allocateBuffer(batch_size, top_k, top_p);
     uint max_top_k = top_k.size() > 0 ? top_k.max<uint>() : 1;
     if (max_top_k == 0) {
@@ -120,7 +120,7 @@ void TopKSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tenso
 template<typename T>
 void TopKSamplingLayer<T>::freeBuffer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (is_allocate_buffer_) {
         allocator_->free((void**)(&sampling_workspace_));
         allocator_->free((void**)(&runtime_top_k_buf_));
@@ -140,7 +140,7 @@ void TopKSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_widt
     // runtime_top_p [1] or [batch_size] on cpu, optional, float.
     // temperature [1] or [batch_size] on cpu, optional
     // repetition_penalty [1] or [batch_size] on cpu, optional
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     BaseSamplingLayer<T>::setup(batch_size, beam_width, runtime_args);
     uint tmp_top_k = 0;
@@ -205,7 +205,7 @@ void TopKSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
     // output_log_probs [local_batch_size], must be float*, optional
     // The log probs at the current step.
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     FT_CHECK(input_tensors->size() >= 4);
     FT_CHECK(output_tensors->size() >= 1);
@@ -308,11 +308,11 @@ TopKSamplingLayer<T>::TopKSamplingLayer(TopKSamplingLayer<T> const& top_k_sampli
 template<typename T>
 TopKSamplingLayer<T>::~TopKSamplingLayer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     freeBuffer();
 }
 template class TopKSamplingLayer<float>;
 template class TopKSamplingLayer<half>;
-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,10 +17,10 @@
 #pragma once
-#include "src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"
+#include "src/turbomind/utils/memory_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 class TopKSamplingLayer: public BaseSamplingLayer<T> {
@@ -71,4 +71,4 @@ public:
     void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
 };
-} // namespace fastertransformer
+} // namespace turbomind
@@ -18,14 +18,14 @@
 #include <algorithm>
 #include <float.h>
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
-#include "src/fastertransformer/kernels/sampling_topp_kernels.h"
-#include "src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.h"
-#include "src/fastertransformer/utils/logger.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/kernels/sampling_topk_kernels.h"
+#include "src/turbomind/kernels/sampling_topp_kernels.h"
+#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
+#include "src/turbomind/utils/logger.h"
+#include "src/turbomind/utils/memory_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 static __global__ void set_topp_runtime_args(int batch_size,
                                              uint top_k,
@@ -117,7 +117,7 @@ void TopPSamplingLayer<T>::allocateBuffer()
 template<typename T>
 void TopPSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     BaseSamplingLayer<T>::allocateBuffer(batch_size, top_k, top_p);
     invokeTopPSampling<T>(nullptr, // workspace
                           sampling_workspace_size_,
@@ -163,7 +163,7 @@ void TopPSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tenso
 template<typename T>
 void TopPSamplingLayer<T>::freeBuffer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (is_allocate_buffer_) {
         allocator_->free((void**)(&sampling_workspace_));
         allocator_->free((void**)(&topp_id_vals_buf_));
@@ -196,7 +196,7 @@ void TopPSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_widt
      * \param top_p_reset_ids [batch_size] on gpu, uint32, optional
      **/
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     BaseSamplingLayer<T>::setup(batch_size, beam_width, runtime_args);
     const Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor();
     const size_t runtime_top_p_size = runtime_top_p.size();
@@ -274,7 +274,7 @@ void TopPSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
        log probs at the current step.
      **/
-    FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+    TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
     FT_CHECK(input_tensors->size() >= 4);
     FT_CHECK(output_tensors->size() >= 1);
@@ -339,7 +339,7 @@ void TopPSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* inp
                           local_batch_size,
                           stream_);
     sync_check_cuda_error();
-    FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
+    TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
 }
 template<typename T>
@@ -390,4 +390,4 @@ TopPSamplingLayer<T>::~TopPSamplingLayer()
 template class TopPSamplingLayer<float>;
 template class TopPSamplingLayer<half>;
-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,9 +17,9 @@
 #pragma once
-#include "src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.h"
+#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 class TopPSamplingLayer: public BaseSamplingLayer<T> {
@@ -81,4 +81,4 @@ public:
     void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
 };
-} // namespace fastertransformer
+} // namespace turbomind
@@ -19,7 +19,7 @@
 #pragma once
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 struct FtWeight {
@@ -46,4 +46,4 @@ public:
     }
 };
-} // namespace fastertransformer
+} // namespace turbomind
@@ -2,16 +2,16 @@
 #pragma once
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/logger.h"
 #include <pthread.h>
-namespace fastertransformer {
+namespace turbomind {
 class Barrier {
 public:
     Barrier(unsigned count)
     {
-        FT_LOG_INFO("Barrier(%d)", (int)count);
+        TM_LOG_INFO("Barrier(%d)", (int)count);
         pthread_barrier_init(&barrier_, nullptr, count);
     }
@@ -34,4 +34,4 @@ private:
     pthread_barrier_t barrier_{};
 };
-} // namespace fastertransformer
+} // namespace turbomind
 // Copyright (c) OpenMMLab. All rights reserved.
-#include "src/fastertransformer/models/llama/LlamaBatch.h"
-#include "src/fastertransformer/kernels/decoding_kernels.h"
-#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
-#include "src/fastertransformer/models/llama/LlamaV2.h"
-#include "src/fastertransformer/models/llama/Request.h"
-#include "src/fastertransformer/models/llama/llama_utils.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/models/llama/LlamaBatch.h"
+#include "src/turbomind/kernels/decoding_kernels.h"
+#include "src/turbomind/models/llama/LlamaNcclGuard.h"
+#include "src/turbomind/models/llama/LlamaV2.h"
+#include "src/turbomind/models/llama/Request.h"
+#include "src/turbomind/models/llama/llama_utils.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/logger.h"
 #include <cstdint>
 #include <iomanip>
 #include <sstream>
 #include <unordered_map>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_reqs,
@@ -28,7 +28,7 @@ void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_r
     };
     auto invalidate = [](const char* type, std::shared_ptr<Request>& req, int ec) {
-        FT_LOG_WARNING("[verifyRequests] Skipping invalid %s request for id %ld, code = %d", type, (long)req->id, ec);
+        TM_LOG_WARNING("[verifyRequests] Skipping invalid %s request for id %ld, code = %d", type, (long)req->id, ec);
         req->signal.set_value(ec);
         req.reset();
     };
@@ -147,7 +147,7 @@ void LlamaBatch<T>::handleStopRequests(const std::vector<std::shared_ptr<Request
 template<typename T>
 void LlamaBatch<T>::allocateBuffer(size_t batch_size, size_t session_len)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     const size_t batchxbeam = batch_size;
     const size_t hidden_units = llama_->hidden_units_;
@@ -239,7 +239,7 @@ void LlamaBatch<T>::allocatePersistantBuffer(size_t max_batch_size)
 template<typename T>
 void LlamaBatch<T>::freeBuffer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (is_allocate_buffer_) {
         allocator_->free((void**)&context_decoder_input_buf_);
         allocator_->free((void**)&context_decoder_ids_buf_);
@@ -340,7 +340,7 @@ void LlamaBatch<T>::initializeSampling(int infer_request_count)
         }
         inputs.insert({param.first, {ref.where, ref.type, shape, param.second}});
        if (debug_ && rank_ == 0) {
-            FT_LOG_INFO("[initializeSampling] %s", format({param.first, inputs.at(param.first)}).c_str());
+            TM_LOG_INFO("[initializeSampling] %s", format({param.first, inputs.at(param.first)}).c_str());
         }
     }
 }
@@ -441,12 +441,12 @@ void LlamaBatch<T>::initializeGeneration()
     step_ = max_context_len_;
     if (rank_ == 0) {
-        FT_LOG_INFO("[initGen] batch_size = %d", (int)batch_size_);
-        FT_LOG_INFO("[initGen] max_context_len = %d", (int)max_context_len_);
-        FT_LOG_INFO("[initGen] slot sequence_id context_len seq_limit_len finished");
+        TM_LOG_INFO("[initGen] batch_size = %d", (int)batch_size_);
+        TM_LOG_INFO("[initGen] max_context_len = %d", (int)max_context_len_);
+        TM_LOG_INFO("[initGen] slot sequence_id context_len seq_limit_len finished");
         for (int i = 0; i < batch_size_; ++i) {
-            FT_LOG_INFO("[initGen] %4d %11ld %11d %13d %8d",
+            TM_LOG_INFO("[initGen] %4d %11ld %11d %13d %8d",
                         i,
                         (long)cached_seq_[i].id,
                         h_context_length_buf_[i],
@@ -461,7 +461,7 @@ bool LlamaBatch<T>::generate()
 {
     constexpr int kLogInterval = 10;
     if (rank_ == 0 && (step_ - 1) % kLogInterval == 0) {
-        FT_LOG_INFO("------------------------- step = %d -------------------------", step_ - 1);
+        TM_LOG_INFO("------------------------- step = %d -------------------------", step_ - 1);
     }
     const bool is_first_step = step_ == max_context_len_;
@@ -530,14 +530,14 @@ bool LlamaBatch<T>::generate()
             for (int k = 0; k < prev.size(); ++k) {
                 sprev << std::setw(6) << prev[k];
             }
-            FT_LOG_INFO("[ lookup ] step = %d, [%s]", step_ - 1, sprev.str().c_str());
+            TM_LOG_INFO("[ lookup ] step = %d, [%s]", step_ - 1, sprev.str().c_str());
         }
         std::stringstream scurr;
         for (int k = 0; k < curr.size(); ++k) {
             scurr << std::setw(6) << curr[k];
         }
-        FT_LOG_INFO("[generate] step = %d, [%s]", step_ - 1, scurr.str().c_str());
+        TM_LOG_INFO("[generate] step = %d, [%s]", step_ - 1, scurr.str().c_str());
     }
     ////////////////////////////////////////////////
@@ -580,7 +580,7 @@ void LlamaBatch<T>::initialize(const std::vector<std::shared_ptr<Request>>& infe
             seq.cache_len = std::min(seq.cache_len, (size_t)step);
         }
         else if (rank_ == 0) {
-            FT_LOG_WARNING("[initialize] Skipping invalid step (%d) setting for ID %ld", step, (long)seq.id);
+            TM_LOG_WARNING("[initialize] Skipping invalid step (%d) setting for ID %ld", step, (long)seq.id);
         }
     }
@@ -697,7 +697,7 @@ void LlamaBatch<T>::initialize(const std::vector<std::shared_ptr<Request>>& infe
         request_seq_len_limit_[i] = session_len_ - 1;
         if (rank_ == 0) {
             const int trunc_output_len = request_seq_len_limit_[i] - h_context_length_buf_[i];
-            FT_LOG_WARNING(
+            TM_LOG_WARNING(
                 "[initialize] [%ld] total sequence length (%d + %d) exceeds session_len (%d), request_output_len is truncated to %d",
                 (long)seq.id,
                 h_context_length_buf_[i],
@@ -729,15 +729,15 @@ void LlamaBatch<T>::initialize(const std::vector<std::shared_ptr<Request>>& infe
         v_cache_ptr_buf_, h_v_cache_ptr_buf_, sizeof(uintptr_t) * batch_size_, cudaMemcpyDefault, stream_));
     if (llama_->tensor_para_.rank_ == 0) {
-        FT_LOG_INFO("[init] infer_request_count = %d", (int)infer_request_count);
-        FT_LOG_INFO("[init] batch_size = %d", (int)batch_size_);
-        FT_LOG_INFO("[init] session_len = %d", (int)session_len_);
-        FT_LOG_INFO("[init] max_input_length = %d", (int)max_input_length);
-        FT_LOG_INFO("[init] max_context_len = %d", (int)max_context_len);
-        FT_LOG_INFO(
+        TM_LOG_INFO("[init] infer_request_count = %d", (int)infer_request_count);
+        TM_LOG_INFO("[init] batch_size = %d", (int)batch_size_);
+        TM_LOG_INFO("[init] session_len = %d", (int)session_len_);
+        TM_LOG_INFO("[init] max_input_length = %d", (int)max_input_length);
+        TM_LOG_INFO("[init] max_context_len = %d", (int)max_context_len);
+        TM_LOG_INFO(
             "[init] slot sequence_id history_len input_len context_len tmp_input_len token_ids.size cache_len");
         for (int i = batch_size_ - infer_request_count; i < batch_size_; ++i) {
-            FT_LOG_INFO("[init] %4d %11ld %11d %9d %11d %13d %14d %9d",
+            TM_LOG_INFO("[init] %4d %11ld %11d %9d %11d %13d %14d %9d",
                         i,
                         (int)cached_seq_[i].id,
                         h_history_length_buf_[i],
@@ -766,7 +766,7 @@ void LlamaBatch<T>::contextDecode()
     const int context_decode_count = batch_size_ - base;
     if (rank_ == 0) {
-        FT_LOG_INFO("[decodeContext] base = %d, count = %d", base, context_decode_count);
+        TM_LOG_INFO("[decodeContext] base = %d, count = %d", base, context_decode_count);
     }
     invokePlusScalar(input_length_buf_ + base, -1, context_decode_count, stream_);
     invokePlusScalar(context_length_buf_ + base, -1, context_decode_count, stream_);
@@ -782,7 +782,7 @@ void LlamaBatch<T>::contextDecode()
         if (i == batch_size_ || token_num + h_context_length_buf_[i] > max_context_token_num_) {
             const int context_decode_batch_size = i - offset;
             if (rank_ == 0) {
-                FT_LOG_INFO(
+                TM_LOG_INFO(
                     "[decodeContext] offset = %d, batch_size = %d, token_num = %d, max_input_len = %d, max_context_len = %d",
                     base,
                     context_decode_batch_size,
@@ -841,11 +841,11 @@ void LlamaBatch<T>::contextDecode()
         check_cuda_error(cudaStreamSynchronize(stream_));
         const auto tock = std::chrono::high_resolution_clock::now();
         if (rank_ == 0) {
-            FT_LOG_INFO("[decodeContext] %.2f ms", std::chrono::duration<float, std::milli>(tock - tick).count());
+            TM_LOG_INFO("[decodeContext] %.2f ms", std::chrono::duration<float, std::milli>(tock - tick).count());
         }
     }
     else if (rank_ == 0) {
-        FT_LOG_INFO("[decodeContext] Context decoding is not needed.");
+        TM_LOG_INFO("[decodeContext] Context decoding is not needed.");
     }
 }
@@ -874,7 +874,7 @@ void LlamaBatch<T>::finish()
         for (int i = 0; i < batch_size_; ++i) {
             ss << (i ? ", " : "") << "(" << h_sequence_lengths_[i] << "," << h_finished_buf_[i] << ")";
         }
-        FT_LOG_INFO("[finish] [%s]", ss.str().c_str());
+        TM_LOG_INFO("[finish] [%s]", ss.str().c_str());
     }
     for (int i = 0; i < batch_size_; ++i) {
@@ -930,7 +930,7 @@ void LlamaBatch<T>::synchronize()
     batch_size_ = idx;
     if (rank_ == 0) {
-        FT_LOG_INFO("[synchronize] batch_size = %d", (int)batch_size_);
+        TM_LOG_INFO("[synchronize] batch_size = %d", (int)batch_size_);
     }
     finished_count_ = 0;
@@ -973,7 +973,7 @@ template<typename T>
 void LlamaBatch<T>::finishRequest(int index, bool force_end)
 {
     if (rank_ == 0) {
-        FT_LOG_INFO("[finishRequest] slot = %d, id = %lu", index, (long)requests_[index]->id);
+        TM_LOG_INFO("[finishRequest] slot = %d, id = %lu", index, (long)requests_[index]->id);
     }
     if (debug_ && rank_ == 0) {
@@ -988,7 +988,7 @@ void LlamaBatch<T>::finishRequest(int index, bool force_end)
         for (const auto& t : tokens) {
             ss << " " << t;
         }
-        FT_LOG_INFO("[finishRequest] slot %d, tokens [%s]", index, ss.str().c_str());
+        TM_LOG_INFO("[finishRequest] slot %d, tokens [%s]", index, ss.str().c_str());
     }
     auto& output_ids_tensor = requests_[index]->outputs[rank_].at("output_ids");
@@ -1039,4 +1039,4 @@ void LlamaBatch<T>::finishRequest(int index, bool force_end)
 template class LlamaBatch<half>;
 template class LlamaBatch<float>;
-} // namespace fastertransformer
+} // namespace turbomind
@@ -2,12 +2,12 @@
 #pragma once
-#include "src/fastertransformer/models/llama/LlamaCacheManager.h"
-#include "src/fastertransformer/models/llama/Request.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
+#include "src/turbomind/models/llama/LlamaCacheManager.h"
+#include "src/turbomind/models/llama/Request.h"
+#include "src/turbomind/utils/allocator.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 class LlamaV2;
@@ -150,4 +150,4 @@ private:
     IAllocator* allocator_{};
 };
-} // namespace fastertransformer
+} // namespace turbomind