Unverified commit 35d64462 authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
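The diff below is a mechanical rename across the LLaMA sources: include paths move from src/fastertransformer/ to src/turbomind/, the enclosing namespace fastertransformer becomes turbomind, and the FT_LOG_* logging macros become TM_LOG_*, while FT_CHECK keeps its old prefix. A minimal before/after sketch of the pattern; the warmup() function and its log message are hypothetical, and the idea that FT_CHECK comes from cuda_utils.h is only inferred from the includes in the files below:

```cpp
// Before this commit (FasterTransformer naming):
//
//   #include "src/fastertransformer/utils/logger.h"
//
//   namespace fastertransformer {
//   void warmup() { FT_LOG_INFO("warmup done"); }
//   }  // namespace fastertransformer

// After this commit (TurboMind naming):
#include "src/turbomind/utils/cuda_utils.h"  // assumed home of FT_CHECK, which is not renamed here
#include "src/turbomind/utils/logger.h"      // TM_LOG_* macros

namespace turbomind {

void warmup()
{
    FT_CHECK(true);              // assertion macro keeps the FT_ prefix in this commit
    TM_LOG_INFO("warmup done");  // FT_LOG_INFO -> TM_LOG_INFO
}

}  // namespace turbomind
```

Throughout the diff, the renamed TM_LOG_INFO calls keep the existing rank_ == 0 guards, so only tensor-parallel rank 0 emits these messages.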
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/models/llama/LlamaCacheManager.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/turbomind/models/llama/LlamaCacheManager.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
namespace fastertransformer {
namespace turbomind {
LlamaCacheManager::~LlamaCacheManager()
{
@@ -16,7 +16,7 @@ LlamaCacheManager::~LlamaCacheManager()
void* LlamaCacheManager::allocate(bool is_preallocte)
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate]");
TM_LOG_INFO("[LlamaCacheManager][allocate]");
}
void* mem_ptr{};
@@ -26,7 +26,7 @@ void* LlamaCacheManager::allocate(bool is_preallocte)
device_free_.pop();
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
TM_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
}
}
else if (entry_count_ < max_entry_count_) {
@@ -34,14 +34,14 @@ void* LlamaCacheManager::allocate(bool is_preallocte)
const size_t entry_byte_size = 2 * cache_byte_size_; // 2 for k,v
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] malloc %d", (int)alloc_count);
TM_LOG_INFO("[LlamaCacheManager][allocate] malloc %d", (int)alloc_count);
}
const auto chunk_ptr = allocator_->malloc(alloc_count * entry_byte_size, false);
FT_CHECK(chunk_ptr);
device_mem_.push_back(chunk_ptr);
entry_count_ += alloc_count;
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] count = %d", entry_count_);
TM_LOG_INFO("[LlamaCacheManager][allocate] count = %d", entry_count_);
}
for (int i = 0; i < alloc_count; ++i) {
@@ -54,7 +54,7 @@ void* LlamaCacheManager::allocate(bool is_preallocte)
}
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
TM_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
}
}
else {
@@ -68,13 +68,13 @@ void* LlamaCacheManager::allocate(bool is_preallocte)
auto LlamaCacheManager::create(uint64_t id, cudaStream_t stream) -> Sequence
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][create] %ld", (long)id);
TM_LOG_INFO("[LlamaCacheManager][create] %ld", (long)id);
}
for (const auto& e : device_cache_) {
if (e.id == id) {
if (rank_ == 0) {
FT_LOG_WARNING("[LlamaCacheManager][create] Removing conflicting id %ld", (long)id);
TM_LOG_WARNING("[LlamaCacheManager][create] Removing conflicting id %ld", (long)id);
}
erase(id);
}
@@ -102,7 +102,7 @@ auto LlamaCacheManager::getEntryOrThrow(uint64_t id) -> std::vector<Sequence>::i
auto pred = [&](const Sequence& s) { return s.id == id; };
auto it = std::find_if(device_cache_.begin(), device_cache_.end(), pred);
if (it == device_cache_.end()) {
FT_LOG_ERROR("[LlamaCacheManager] %ld not found.\n", (long)id);
TM_LOG_ERROR("[LlamaCacheManager] %ld not found.\n", (long)id);
FT_CHECK(0);
}
return it;
@@ -111,7 +111,7 @@ auto LlamaCacheManager::getEntryOrThrow(uint64_t id) -> std::vector<Sequence>::i
auto LlamaCacheManager::fetch(uint64_t id, cudaStream_t stream) -> Sequence
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][fetch] %ld", (long)id);
TM_LOG_INFO("[LlamaCacheManager][fetch] %ld", (long)id);
}
auto entry = getEntryOrThrow(id);
@@ -131,7 +131,7 @@ auto LlamaCacheManager::fetch(uint64_t id, cudaStream_t stream) -> Sequence
void LlamaCacheManager::update(const Sequence& seq, cudaStream_t stream)
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][update] %ld", (long)seq.id);
TM_LOG_INFO("[LlamaCacheManager][update] %ld", (long)seq.id);
}
auto entry = getEntryOrThrow(seq.id);
@@ -145,7 +145,7 @@ void LlamaCacheManager::update(const Sequence& seq, cudaStream_t stream)
void LlamaCacheManager::erase(uint64_t id)
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][erase] %ld", (long)id);
TM_LOG_INFO("[LlamaCacheManager][erase] %ld", (long)id);
}
auto entry = getEntryOrThrow(id);
@@ -153,7 +153,7 @@ void LlamaCacheManager::erase(uint64_t id)
if (entry->k_cache) {
device_free_.push(entry->k_cache);
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][erase] free = %d", (int)device_free_.size());
TM_LOG_INFO("[LlamaCacheManager][erase] free = %d", (int)device_free_.size());
}
}
device_cache_.erase(entry);
@@ -171,7 +171,7 @@ void* LlamaCacheManager::evict()
}
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][evict] %ld", (long)it->id);
TM_LOG_INFO("[LlamaCacheManager][evict] %ld", (long)it->id);
}
FT_CHECK(it->k_cache);
@@ -189,4 +189,4 @@ bool LlamaCacheManager::contains(uint64_t id) const noexcept
return it != device_cache_.end();
}
} // namespace fastertransformer
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/logger.h"
#include <cstdint>
#include <cuda_runtime.h>
#include <queue>
#include <unordered_map>
#include <vector>
namespace fastertransformer {
namespace turbomind {
// k-cache layout [L, H, D/x, S[s:], x]
// v-cache layout [L, H, S[s:], D/x, x]
@@ -36,8 +36,8 @@ public:
allocator_(allocator)
{
if (rank == 0) {
FT_LOG_INFO("[LlamaCacheManager] max_entry_count = %d", (int)max_entry_count_);
FT_LOG_INFO("[LlamaCacheManager] chunk_size = %d", (int)chunk_size_);
TM_LOG_INFO("[LlamaCacheManager] max_entry_count = %d", (int)max_entry_count_);
TM_LOG_INFO("[LlamaCacheManager] chunk_size = %d", (int)chunk_size_);
}
allocate(true);
}
@@ -99,4 +99,4 @@ private:
std::vector<Sequence> device_cache_;
};
} // namespace fastertransformer
} // namespace turbomind
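For orientation, the LlamaCacheManager interface touched above manages per-sequence KV-cache entries through allocate/create/fetch/update/erase/evict/contains. Below is a hedged usage sketch based only on the signatures visible in this diff; the run_request() wrapper and its control flow are illustrative assumptions, not code from the repository:

```cpp
#include "src/turbomind/models/llama/LlamaCacheManager.h"

#include <cstdint>
#include <cuda_runtime.h>

// Illustrative only: drive one request through the sequence KV-cache manager.
void run_request(turbomind::LlamaCacheManager& cache, uint64_t id, cudaStream_t stream)
{
    // Resume the sequence if its entry is still cached, otherwise create it
    // (create() warns about and removes a conflicting entry with the same id).
    auto seq = cache.contains(id) ? cache.fetch(id, stream) : cache.create(id, stream);

    // ... run context / decoder passes that read and write seq's k/v blocks ...

    cache.update(seq, stream);  // persist the sequence's new state
    cache.erase(id);            // session finished: return the entry to the free list
}
```

Device memory for the entries is handled inside the manager itself via allocate() and evict(), as shown in the .cc diff above.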
@@ -17,18 +17,18 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/GptContextAttentionLayer.cc
#include "src/fastertransformer/models/llama/LlamaContextAttentionLayer.h"
#include "src/fastertransformer/kernels/bert_preprocess_kernels.h"
#include "src/fastertransformer/kernels/unfused_attention_kernels.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
@@ -36,7 +36,7 @@ void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
size_t max_q_len,
size_t max_k_len)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
// no padding
qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * num_token * 3 * local_hidden_units_, true);
@@ -75,7 +75,7 @@ template<typename T>
void LlamaContextAttentionLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
allocator_->free((void**)(&qkv_buf_));
allocator_->free((void**)(&q_buf_2_));
@@ -98,7 +98,7 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
const TensorMap* input_tensors,
const LlamaAttentionWeight<T>* weights)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
/**
* input_tensors:
@@ -403,4 +403,4 @@ void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_c
template class LlamaContextAttentionLayer<float>;
template class LlamaContextAttentionLayer<half>;
} // namespace fastertransformer
} // namespace turbomind
@@ -17,16 +17,16 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/GptContextAttentionLayer.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
#include "src/fastertransformer/models/llama/LlamaLinear.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
class LlamaContextAttentionLayer {
@@ -124,4 +124,4 @@ private:
bool is_allocate_buffer_ = false;
};
} // namespace fastertransformer
} // namespace turbomind
@@ -16,17 +16,17 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.cc
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptContextDecoder.cc
#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
#include "src/fastertransformer/kernels/bert_preprocess_kernels.h"
#include "src/fastertransformer/kernels/gpt_kernels.h"
#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
#include "src/fastertransformer/models/llama/llama_decoder_kernels.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/utils/Tensor.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
void LlamaContextDecoder<T>::allocateBuffer()
@@ -37,7 +37,7 @@ void LlamaContextDecoder<T>::allocateBuffer()
template<typename T>
void LlamaContextDecoder<T>::allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
attn_ffn_io_ = (T*)allocator_->reMalloc(attn_ffn_io_, sizeof(T) * num_token * hidden_units_, false);
attention_mask_ = (T*)allocator_->reMalloc(attention_mask_, sizeof(T) * batch_size * max_q_len * max_kv_len, false);
@@ -50,7 +50,7 @@ void LlamaContextDecoder<T>::allocateBuffer(size_t batch_size, size_t num_token,
template<typename T>
void LlamaContextDecoder<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)&attn_ffn_io_);
allocator_->free((void**)&padding_offset_);
@@ -94,7 +94,7 @@ void LlamaContextDecoder<T>::forwardSelfAttn(const Session&
int layer,
bool is_final)
{
// FT_LOG_ERROR(__PRETTY_FUNCTION__);
// TM_LOG_ERROR(__PRETTY_FUNCTION__);
TensorMap self_attention_input_tensors{
{"input_query", Tensor{MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_ffn_io_}},
{"attention_mask",
@@ -283,4 +283,4 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
template class LlamaContextDecoder<float>;
template class LlamaContextDecoder<half>;
} // namespace fastertransformer
} // namespace turbomind
@@ -16,25 +16,25 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptContextDecoder.h
#pragma once
// #include "src/fastertransformer/kernels/add_residual_kernels.h"
// #include "src/fastertransformer/kernels/layernorm_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
// #include "src/fastertransformer/layers/FfnLayer.h"
// #include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h"
#include "src/fastertransformer/models/llama/LlamaContextAttentionLayer.h"
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
namespace fastertransformer {
// #include "src/turbomind/kernels/add_residual_kernels.h"
// #include "src/turbomind/kernels/layernorm_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
// #include "src/turbomind/layers/FfnLayer.h"
// #include "src/turbomind/layers/attention_layers/BaseAttentionLayer.h"
#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
template<typename T>
class LlamaContextDecoder: public BaseLayer {
@@ -112,4 +112,4 @@ public:
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace fastertransformer
} // namespace turbomind
@@ -17,14 +17,14 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.cc
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.cc
#include "src/fastertransformer/models/llama/LlamaDecoder.h"
#include "src/fastertransformer/models/llama/llama_decoder_kernels.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/turbomind/models/llama/LlamaDecoder.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
LlamaDecoder<T>::LlamaDecoder(size_t head_num,
@@ -50,14 +50,14 @@ LlamaDecoder<T>::LlamaDecoder(size_t head_num,
tensor_para_(tensor_para),
data_type_(getTensorType<T>())
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize(quant_policy);
}
template<typename T>
LlamaDecoder<T>::~LlamaDecoder()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
delete self_attention_layer_;
delete silu_ffn_layer_;
}
@@ -65,7 +65,7 @@ LlamaDecoder<T>::~LlamaDecoder()
template<typename T>
void LlamaDecoder<T>::initialize(int quant_policy)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
self_attention_layer_ = new LlamaDecoderSelfAttentionLayer<T>(head_num_,
size_per_head_,
@@ -97,14 +97,14 @@ void LlamaDecoder<T>::allocateBuffer()
template<typename T>
void LlamaDecoder<T>::allocateBuffer(size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaDecoder<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
is_allocate_buffer_ = false;
}
@@ -116,7 +116,7 @@ void LlamaDecoder<T>::forwardSelfAttn(const LlamaDecoder::Session&
const std::unordered_map<std::string, Tensor>* input_tensors,
size_t layer)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
TensorMap self_attention_input_tensors(*input_tensors);
self_attention_input_tensors.insert("input_query",
{MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, attn_io});
@@ -157,7 +157,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
/**
* input_tensors:
* \param decoder_input [batch_size, hidden_dims]
@@ -242,4 +242,4 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
template class LlamaDecoder<half>;
template class LlamaDecoder<float>;
} // namespace fastertransformer
} // namespace turbomind
@@ -17,17 +17,17 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.h
#include "src/fastertransformer/layers/BaseLayer.h"
// #include "src/fastertransformer/layers/FfnLayer.h"
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include "src/turbomind/layers/BaseLayer.h"
// #include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
class LlamaDecoder: public BaseLayer {
@@ -93,4 +93,4 @@ public:
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace fastertransformer
} // namespace turbomind
@@ -16,13 +16,13 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/memory_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t hidden_units,
@@ -122,12 +122,12 @@ void LlamaDecoderLayerWeight<T>::mallocWeights()
deviceMalloc((T**)&self_attn_norm_weights, hidden_units_);
deviceMalloc((T**)&ffn_norm_weights, hidden_units_);
fastertransformer::mallocWeights(self_attn_weights.qkv, attn_bias_);
fastertransformer::mallocWeights(self_attn_weights.output, attn_bias_);
turbomind::mallocWeights(self_attn_weights.qkv, attn_bias_);
turbomind::mallocWeights(self_attn_weights.output, attn_bias_);
fastertransformer::mallocWeights(ffn_weights.gating, false);
fastertransformer::mallocWeights(ffn_weights.intermediate, false);
fastertransformer::mallocWeights(ffn_weights.output, false);
turbomind::mallocWeights(ffn_weights.gating, false);
turbomind::mallocWeights(ffn_weights.intermediate, false);
turbomind::mallocWeights(ffn_weights.output, false);
}
template<typename T>
@@ -175,4 +175,4 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
template struct LlamaDecoderLayerWeight<float>;
template struct LlamaDecoderLayerWeight<half>;
} // namespace fastertransformer
} // namespace turbomind
@@ -16,13 +16,13 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.h
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
struct LlamaDecoderLayerWeight {
@@ -58,4 +58,4 @@ private:
void mallocWeights();
};
} // namespace fastertransformer
} // namespace turbomind
@@ -16,18 +16,18 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.cc
#include "src/fastertransformer/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/DecoderSelfAttentionLayer.cc
#include "src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/nvtx_utils.h"
#include <string>
// #include <glog/logging.h>
namespace fastertransformer {
namespace turbomind {
template<typename T>
struct SATypeConverter {
@@ -157,7 +157,7 @@ static inline void fusedQKV_masked_attention_dispatch(const T* qkv_buf,
template<typename T>
void LlamaDecoderSelfAttentionLayer<T>::allocateBuffer(size_t batch_size, int key_len, int max_memory_len)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
qkv_buf_ =
reinterpret_cast<T*>(allocator_->reMalloc(qkv_buf_, sizeof(T) * batch_size * 3 * local_hidden_units_, false));
context_buf_ =
@@ -289,4 +289,4 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
template class LlamaDecoderSelfAttentionLayer<float>;
template class LlamaDecoderSelfAttentionLayer<half>;
} // namespace fastertransformer
} // namespace turbomind
@@ -16,16 +16,16 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/DecoderSelfAttentionLayer.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
#include "src/fastertransformer/models/llama/LlamaLinear.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
class LlamaDecoderSelfAttentionLayer {
@@ -97,4 +97,4 @@ private:
bool is_allocate_buffer_{};
};
} // namespace fastertransformer
} // namespace turbomind
@@ -15,15 +15,15 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/DenseWeight.h
#pragma once
#include "src/fastertransformer/layers/FfnWeight.h"
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/layers/FfnWeight.h"
#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace fastertransformer {
namespace turbomind {
enum class WeightType : int {
kFP32,
@@ -75,4 +75,4 @@ struct LlamaFfnWeight {
LlamaDenseWeight<T> output;
};
} // namespace fastertransformer
} // namespace turbomind
@@ -15,15 +15,15 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.h
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/FfnLayer.h
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/kernels/activation_kernels.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/kernels/activation_kernels.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/utils/nvtx_utils.h"
// #include <glog/logging.h>
namespace fastertransformer {
namespace turbomind {
template<typename T>
void LlamaFfnLayer<T>::allocateBuffer(size_t token_num)
@@ -110,4 +110,4 @@ void LlamaFfnLayer<T>::forward(TensorMap* output_tensors,
template class LlamaFfnLayer<float>;
template class LlamaFfnLayer<half>;
} // namespace fastertransformer
} // namespace turbomind
@@ -15,18 +15,18 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.cc
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/FfnLayer.cc
#pragma once
// #include "src/fastertransformer/layers/FfnLayer.h"
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/models/llama/LlamaLinear.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
// #include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/nccl_utils.h"
#include <functional>
namespace fastertransformer {
namespace turbomind {
template<typename T>
class LlamaFfnLayer {
@@ -82,4 +82,4 @@ private:
bool is_allocate_buffer_{};
};
} // namespace fastertransformer
} // namespace turbomind
@@ -2,10 +2,10 @@
#pragma once
#include "src/fastertransformer/models/llama/Barrier.h"
#include "src/fastertransformer/utils/instance_comm.h"
#include "src/turbomind/models/llama/Barrier.h"
#include "src/turbomind/utils/instance_comm.h"
namespace fastertransformer {
namespace turbomind {
class LlamaInstanceComm: public AbstractInstanceComm {
public:
@@ -31,4 +31,4 @@ private:
void* ptr{};
};
} // namespace fastertransformer
} // namespace turbomind
@@ -2,12 +2,12 @@
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace fastertransformer {
namespace turbomind {
template<typename T>
class LlamaLinear {
@@ -58,4 +58,4 @@ private:
cudaStream_t stream_{};
};
} // namespace fastertransformer
} // namespace turbomind
@@ -2,14 +2,14 @@
#pragma once
#include "src/fastertransformer/utils/nccl_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
#include <array>
#include <atomic>
#include <condition_variable>
#include <cuda_runtime.h>
#include <mutex>
namespace fastertransformer {
namespace turbomind {
struct NcclGuard {
static constexpr int kMaxGroupCount = 32;
@@ -89,4 +89,4 @@
std::unique_ptr<std::lock_guard<std::mutex>> global_nccl_lock_;
};
} // namespace fastertransformer
} // namespace turbomind
@@ -18,24 +18,24 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
#include "src/fastertransformer/models/llama/LlamaV2.h"
#include "src/fastertransformer/kernels/decoding_kernels.h"
#include "src/fastertransformer/kernels/gpt_kernels.h"
#include "src/fastertransformer/models/llama/LlamaBatch.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/models/llama/LlamaWeight.h"
#include "src/fastertransformer/models/llama/Request.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_utils.h"
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGpt.cc
#include "src/turbomind/models/llama/LlamaV2.h"
#include "src/turbomind/kernels/decoding_kernels.h"
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/models/llama/LlamaBatch.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/LlamaWeight.h"
#include "src/turbomind/models/llama/Request.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <functional>
#include <memory>
#include <sstream>
#include <stdexcept>
namespace fastertransformer {
namespace turbomind {
template<typename T>
LlamaV2<T>::LlamaV2(size_t head_num,
@@ -87,15 +87,15 @@ LlamaV2<T>::LlamaV2(size_t head_num,
shared_state_(shared_state)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(vocab_size_ % tensor_para_.world_size_ == 0);
FT_LOG_INFO("NCCL group_id = %d", tensor_para_.group_id_);
TM_LOG_INFO("NCCL group_id = %d", tensor_para_.group_id_);
size_t elem_bits = 0;
if (quant_policy & QuantPolicy::kCacheKVInt8) {
elem_bits = sizeof(int8_t) * 8;
if (use_context_fmha) {
FT_LOG_ERROR("use_context_fmha not support int8");
TM_LOG_ERROR("use_context_fmha not support int8");
assert(0);
}
}
@@ -128,7 +128,7 @@ LlamaV2<T>::~LlamaV2()
template<typename T>
void LlamaV2<T>::initialize(bool use_context_fmha, int quant_policy)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
context_decoder_ = new LlamaContextDecoder<T>(head_num_,
size_per_head_,
@@ -170,7 +170,7 @@ void LlamaV2<T>::initialize(bool use_context_fmha, int quant_policy)
template<typename T>
void LlamaV2<T>::embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
// ! This kernel can't be used in context decoding
invokeEmbeddingLookupPosEncodingPadCount(embeddings,
weights_->pre_decoder_embedding_table,
@@ -203,10 +203,10 @@ void LlamaV2<T>::contextDecode(T* deocder_output,
size_t session_len,
size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (tensor_para_.rank_ == 0) {
FT_LOG_INFO("context decoding start");
TM_LOG_INFO("context decoding start");
}
invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf,
@@ -250,7 +250,7 @@ void LlamaV2<T>::contextDecode(T* deocder_output,
context_decoder_->forward(&decoder_output_tensors, &decoder_input_tensors, &weights_->decoder_layer_weights);
if (tensor_para_.rank_ == 0) {
FT_LOG_INFO("context decoding end");
TM_LOG_INFO("context decoding end");
}
}
@@ -267,7 +267,7 @@ void LlamaV2<T>::decoderForward(T* decoder_output,
size_t session_len,
size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
const int max_seq_len = session_len;
const auto dtype = getTensorType<T>();
@@ -298,7 +298,7 @@
template<typename T>
void LlamaV2<T>::postDecodeEmbedding(float* logits, float* local_logits, const T* decoder_output, int batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
cudaDataType_t data_type = getCudaDataType<T>();
float alpha = 1.f;
float beta = 0.f;
@@ -375,7 +375,7 @@ void LlamaV2<T>::dynamicDecode(int* token_ids,
size_t token_ids_len,
size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
int local_batch_size = (int)batch_size;
std::unordered_map<std::string, Tensor> dynamic_decode_input_tensors{
@@ -421,7 +421,7 @@ void LlamaV2<T>::dynamicDecode(int* token_ids,
template<typename T>
void LlamaV2<T>::internalThreadEntry(int device_id)
{
FT_LOG_INFO("[internalThreadEntry] %d", (int)tensor_para_.rank_);
TM_LOG_INFO("[internalThreadEntry] %d", (int)tensor_para_.rank_);
check_cuda_error(cudaSetDevice(device_id));
auto& request_queue = shared_state_->request_queue;
@@ -514,10 +514,10 @@ void LlamaV2<T>::forward(std::unordered_map<std::string, Tensor>* outputs,
if (debug_) {
if (tensor_para_.rank_ == 0) {
for (const auto& kv : *inputs) {
FT_LOG_INFO("[forward][rank=%d] INPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
TM_LOG_INFO("[forward][rank=%d] INPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
}
for (const auto& kv : *outputs) {
FT_LOG_INFO("[forward][rank=%d] OUTPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
TM_LOG_INFO("[forward][rank=%d] OUTPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
}
}
}
@@ -566,10 +566,10 @@ void LlamaV2<T>::forward(std::unordered_map<std::string, Tensor>* outputs,
std::vector<int> error_codes;
bool has_error = 0;
if (rank == 0) {
FT_LOG_INFO("[forward] Enqueue requests");
TM_LOG_INFO("[forward] Enqueue requests");
auto futures = shared_state_->request_queue.enqueue(std::move(requests));
FT_LOG_INFO("[forward] Wait for requests to complete ...");
TM_LOG_INFO("[forward] Wait for requests to complete ...");
for (auto& f : futures) {
auto ec = f.get();
error_codes.push_back(ec);
@@ -594,4 +594,4 @@ void LlamaV2<T>::forward(std::unordered_map<std::string, Tensor>* outputs,
template class LlamaV2<half>;
template class LlamaV2<float>;
} // namespace fastertransformer
} // namespace turbomind
@@ -17,24 +17,24 @@
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGpt.h
#pragma once
#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
#include "src/fastertransformer/models/llama/Barrier.h"
#include "src/fastertransformer/models/llama/LlamaBatch.h"
#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
#include "src/fastertransformer/models/llama/LlamaDecoder.h"
#include "src/fastertransformer/models/llama/LlamaWeight.h"
#include "src/fastertransformer/models/llama/Request.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/instance_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/models/llama/Barrier.h"
#include "src/turbomind/models/llama/LlamaBatch.h"
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/models/llama/LlamaDecoder.h"
#include "src/turbomind/models/llama/LlamaWeight.h"
#include "src/turbomind/models/llama/Request.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/instance_comm.h"
#include "src/turbomind/utils/nccl_utils.h"
#include <unordered_map>
namespace fastertransformer {
namespace turbomind {
template<typename T>
class LlamaV2 {
@@ -183,4 +183,4 @@ private:
std::thread internal_thread_;
};
} // namespace fastertransformer
} // namespace turbomind