Unverified commit 35d64462 authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
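
The diff below is a mechanical rename: include paths move from src/fastertransformer/ to src/turbomind/, namespace fastertransformer becomes namespace turbomind, and the FT_LOG_* logging macros become TM_LOG_* (check macros such as FT_CHECK are left unchanged in this commit). As a minimal sketch of what a call site looks like after the rename, assuming the turbomind source tree is on the include path; the header path, namespace, and TM_LOG_INFO macro come from the diff, while the surrounding example function is hypothetical:

    // Hypothetical caller, only illustrating the renamed namespace and logger macro;
    // it is not part of this commit.
    #include "src/turbomind/utils/logger.h"

    namespace turbomind {  // was: namespace fastertransformer

    void logAllocateExample(int rank)
    {
        if (rank == 0) {
            // was: FT_LOG_INFO(...) before this commit
            TM_LOG_INFO("[example][allocate] rank = %d", rank);
        }
    }

    }  // namespace turbomind
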
 // Copyright (c) OpenMMLab. All rights reserved.
-#include "src/fastertransformer/models/llama/LlamaCacheManager.h"
+#include "src/turbomind/models/llama/LlamaCacheManager.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/logger.h"
-namespace fastertransformer {
+namespace turbomind {
 LlamaCacheManager::~LlamaCacheManager()
 {
@@ -16,7 +16,7 @@ LlamaCacheManager::~LlamaCacheManager()
 void* LlamaCacheManager::allocate(bool is_preallocte)
 {
     if (rank_ == 0) {
-        FT_LOG_INFO("[LlamaCacheManager][allocate]");
+        TM_LOG_INFO("[LlamaCacheManager][allocate]");
     }
     void* mem_ptr{};
@@ -26,7 +26,7 @@ void* LlamaCacheManager::allocate(bool is_preallocte)
         device_free_.pop();
         if (rank_ == 0) {
-            FT_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
+            TM_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
         }
     }
     else if (entry_count_ < max_entry_count_) {
@@ -34,14 +34,14 @@ void* LlamaCacheManager::allocate(bool is_preallocte)
         const size_t entry_byte_size = 2 * cache_byte_size_; // 2 for k,v
         if (rank_ == 0) {
-            FT_LOG_INFO("[LlamaCacheManager][allocate] malloc %d", (int)alloc_count);
+            TM_LOG_INFO("[LlamaCacheManager][allocate] malloc %d", (int)alloc_count);
         }
         const auto chunk_ptr = allocator_->malloc(alloc_count * entry_byte_size, false);
         FT_CHECK(chunk_ptr);
         device_mem_.push_back(chunk_ptr);
         entry_count_ += alloc_count;
         if (rank_ == 0) {
-            FT_LOG_INFO("[LlamaCacheManager][allocate] count = %d", entry_count_);
+            TM_LOG_INFO("[LlamaCacheManager][allocate] count = %d", entry_count_);
         }
         for (int i = 0; i < alloc_count; ++i) {
@@ -54,7 +54,7 @@ void* LlamaCacheManager::allocate(bool is_preallocte)
         }
         if (rank_ == 0) {
-            FT_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
+            TM_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
         }
     }
     else {
@@ -68,13 +68,13 @@ void* LlamaCacheManager::allocate(bool is_preallocte)
 auto LlamaCacheManager::create(uint64_t id, cudaStream_t stream) -> Sequence
 {
     if (rank_ == 0) {
-        FT_LOG_INFO("[LlamaCacheManager][create] %ld", (long)id);
+        TM_LOG_INFO("[LlamaCacheManager][create] %ld", (long)id);
     }
     for (const auto& e : device_cache_) {
         if (e.id == id) {
             if (rank_ == 0) {
-                FT_LOG_WARNING("[LlamaCacheManager][create] Removing conflicting id %ld", (long)id);
+                TM_LOG_WARNING("[LlamaCacheManager][create] Removing conflicting id %ld", (long)id);
             }
             erase(id);
         }
@@ -102,7 +102,7 @@ auto LlamaCacheManager::getEntryOrThrow(uint64_t id) -> std::vector<Sequence>::i
     auto pred = [&](const Sequence& s) { return s.id == id; };
     auto it = std::find_if(device_cache_.begin(), device_cache_.end(), pred);
     if (it == device_cache_.end()) {
-        FT_LOG_ERROR("[LlamaCacheManager] %ld not found.\n", (long)id);
+        TM_LOG_ERROR("[LlamaCacheManager] %ld not found.\n", (long)id);
         FT_CHECK(0);
     }
     return it;
@@ -111,7 +111,7 @@ auto LlamaCacheManager::getEntryOrThrow(uint64_t id) -> std::vector<Sequence>::i
 auto LlamaCacheManager::fetch(uint64_t id, cudaStream_t stream) -> Sequence
 {
     if (rank_ == 0) {
-        FT_LOG_INFO("[LlamaCacheManager][fetch] %ld", (long)id);
+        TM_LOG_INFO("[LlamaCacheManager][fetch] %ld", (long)id);
     }
     auto entry = getEntryOrThrow(id);
@@ -131,7 +131,7 @@ auto LlamaCacheManager::fetch(uint64_t id, cudaStream_t stream) -> Sequence
 void LlamaCacheManager::update(const Sequence& seq, cudaStream_t stream)
 {
     if (rank_ == 0) {
-        FT_LOG_INFO("[LlamaCacheManager][update] %ld", (long)seq.id);
+        TM_LOG_INFO("[LlamaCacheManager][update] %ld", (long)seq.id);
     }
     auto entry = getEntryOrThrow(seq.id);
@@ -145,7 +145,7 @@ void LlamaCacheManager::update(const Sequence& seq, cudaStream_t stream)
 void LlamaCacheManager::erase(uint64_t id)
 {
     if (rank_ == 0) {
-        FT_LOG_INFO("[LlamaCacheManager][erase] %ld", (long)id);
+        TM_LOG_INFO("[LlamaCacheManager][erase] %ld", (long)id);
     }
     auto entry = getEntryOrThrow(id);
@@ -153,7 +153,7 @@ void LlamaCacheManager::erase(uint64_t id)
     if (entry->k_cache) {
         device_free_.push(entry->k_cache);
         if (rank_ == 0) {
-            FT_LOG_INFO("[LlamaCacheManager][erase] free = %d", (int)device_free_.size());
+            TM_LOG_INFO("[LlamaCacheManager][erase] free = %d", (int)device_free_.size());
         }
     }
     device_cache_.erase(entry);
@@ -171,7 +171,7 @@ void* LlamaCacheManager::evict()
     }
     if (rank_ == 0) {
-        FT_LOG_INFO("[LlamaCacheManager][evict] %ld", (long)it->id);
+        TM_LOG_INFO("[LlamaCacheManager][evict] %ld", (long)it->id);
     }
     FT_CHECK(it->k_cache);
@@ -189,4 +189,4 @@ bool LlamaCacheManager::contains(uint64_t id) const noexcept
     return it != device_cache_.end();
 }
-} // namespace fastertransformer
+} // namespace turbomind

 // Copyright (c) OpenMMLab. All rights reserved.
-#include "src/fastertransformer/utils/allocator.h"
+#include "src/turbomind/utils/allocator.h"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/logger.h"
 #include <cstdint>
 #include <cuda_runtime.h>
 #include <queue>
 #include <unordered_map>
 #include <vector>
-namespace fastertransformer {
+namespace turbomind {
 // k-cache layout [L, H, D/x, S[s:], x]
 // v-cache layout [L, H, S[s:], D/x, x]
@@ -36,8 +36,8 @@ public:
         allocator_(allocator)
     {
         if (rank == 0) {
-            FT_LOG_INFO("[LlamaCacheManager] max_entry_count = %d", (int)max_entry_count_);
+            TM_LOG_INFO("[LlamaCacheManager] max_entry_count = %d", (int)max_entry_count_);
-            FT_LOG_INFO("[LlamaCacheManager] chunk_size = %d", (int)chunk_size_);
+            TM_LOG_INFO("[LlamaCacheManager] chunk_size = %d", (int)chunk_size_);
        }
        allocate(true);
    }
@@ -99,4 +99,4 @@ private:
     std::vector<Sequence> device_cache_;
 };
-} // namespace fastertransformer
+} // namespace turbomind

@@ -17,18 +17,18 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/GptContextAttentionLayer.cc
-#include "src/fastertransformer/models/llama/LlamaContextAttentionLayer.h"
+#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
-#include "src/fastertransformer/kernels/bert_preprocess_kernels.h"
+#include "src/turbomind/kernels/bert_preprocess_kernels.h"
-#include "src/fastertransformer/kernels/unfused_attention_kernels.h"
+#include "src/turbomind/kernels/unfused_attention_kernels.h"
-#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
+#include "src/turbomind/models/llama/LlamaNcclGuard.h"
-#include "src/fastertransformer/models/llama/llama_kernels.h"
+#include "src/turbomind/models/llama/llama_kernels.h"
-#include "src/fastertransformer/models/llama/llama_utils.h"
+#include "src/turbomind/models/llama/llama_utils.h"
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/utils/Tensor.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
@@ -36,7 +36,7 @@ void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
                                                    size_t max_q_len,
                                                    size_t max_k_len)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     // no padding
     qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * num_token * 3 * local_hidden_units_, true);
@@ -75,7 +75,7 @@ template<typename T>
 void LlamaContextAttentionLayer<T>::freeBuffer()
 {
     if (is_allocate_buffer_) {
-        FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
         allocator_->free((void**)(&qkv_buf_));
         allocator_->free((void**)(&q_buf_2_));
@@ -98,7 +98,7 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
                                                    const TensorMap* input_tensors,
                                                    const LlamaAttentionWeight<T>* weights)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     /**
      * input_tensors:
@@ -403,4 +403,4 @@ void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_c
 template class LlamaContextAttentionLayer<float>;
 template class LlamaContextAttentionLayer<half>;
-} // namespace fastertransformer
+} // namespace turbomind

@@ -17,16 +17,16 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/GptContextAttentionLayer.h
 #pragma once
-#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
+#include "src/turbomind/models/llama/LlamaDenseWeight.h"
-#include "src/fastertransformer/models/llama/LlamaLinear.h"
+#include "src/turbomind/models/llama/LlamaLinear.h"
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/utils/Tensor.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 class LlamaContextAttentionLayer {
@@ -124,4 +124,4 @@ private:
     bool is_allocate_buffer_ = false;
 };
-} // namespace fastertransformer
+} // namespace turbomind

@@ -16,17 +16,17 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.cc
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptContextDecoder.cc
-#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
+#include "src/turbomind/models/llama/LlamaContextDecoder.h"
-#include "src/fastertransformer/kernels/bert_preprocess_kernels.h"
+#include "src/turbomind/kernels/bert_preprocess_kernels.h"
-#include "src/fastertransformer/kernels/gpt_kernels.h"
+#include "src/turbomind/kernels/gpt_kernels.h"
-#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
+#include "src/turbomind/models/llama/LlamaContextDecoder.h"
-#include "src/fastertransformer/models/llama/llama_decoder_kernels.h"
+#include "src/turbomind/models/llama/llama_decoder_kernels.h"
-#include "src/fastertransformer/models/llama/llama_kernels.h"
+#include "src/turbomind/models/llama/llama_kernels.h"
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/utils/Tensor.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void LlamaContextDecoder<T>::allocateBuffer()
@@ -37,7 +37,7 @@ void LlamaContextDecoder<T>::allocateBuffer()
 template<typename T>
 void LlamaContextDecoder<T>::allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     attn_ffn_io_ = (T*)allocator_->reMalloc(attn_ffn_io_, sizeof(T) * num_token * hidden_units_, false);
     attention_mask_ = (T*)allocator_->reMalloc(attention_mask_, sizeof(T) * batch_size * max_q_len * max_kv_len, false);
@@ -50,7 +50,7 @@ void LlamaContextDecoder<T>::allocateBuffer(size_t batch_size, size_t num_token,
 template<typename T>
 void LlamaContextDecoder<T>::freeBuffer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (is_allocate_buffer_) {
         allocator_->free((void**)&attn_ffn_io_);
         allocator_->free((void**)&padding_offset_);
@@ -94,7 +94,7 @@ void LlamaContextDecoder<T>::forwardSelfAttn(const Session&
                                              int layer,
                                              bool is_final)
 {
-    // FT_LOG_ERROR(__PRETTY_FUNCTION__);
+    // TM_LOG_ERROR(__PRETTY_FUNCTION__);
     TensorMap self_attention_input_tensors{
         {"input_query", Tensor{MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_ffn_io_}},
         {"attention_mask",
@@ -283,4 +283,4 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
 template class LlamaContextDecoder<float>;
 template class LlamaContextDecoder<half>;
-} // namespace fastertransformer
+} // namespace turbomind

@@ -16,25 +16,25 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptContextDecoder.h
 #pragma once
-// #include "src/fastertransformer/kernels/add_residual_kernels.h"
+// #include "src/turbomind/kernels/add_residual_kernels.h"
-// #include "src/fastertransformer/kernels/layernorm_kernels.h"
+// #include "src/turbomind/kernels/layernorm_kernels.h"
-#include "src/fastertransformer/layers/BaseLayer.h"
+#include "src/turbomind/layers/BaseLayer.h"
-// #include "src/fastertransformer/layers/FfnLayer.h"
+// #include "src/turbomind/layers/FfnLayer.h"
-// #include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h"
+// #include "src/turbomind/layers/attention_layers/BaseAttentionLayer.h"
-#include "src/fastertransformer/models/llama/LlamaContextAttentionLayer.h"
+#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
-#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
+#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
-#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
+#include "src/turbomind/models/llama/LlamaFfnLayer.h"
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/utils/Tensor.h"
-#include "src/fastertransformer/utils/allocator.h"
+#include "src/turbomind/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/custom_ar_comm.h"
+#include "src/turbomind/utils/custom_ar_comm.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 class LlamaContextDecoder: public BaseLayer {
@@ -112,4 +112,4 @@ public:
                  const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
 };
-} // namespace fastertransformer
+} // namespace turbomind

@@ -17,14 +17,14 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.cc
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.cc
-#include "src/fastertransformer/models/llama/LlamaDecoder.h"
+#include "src/turbomind/models/llama/LlamaDecoder.h"
-#include "src/fastertransformer/models/llama/llama_decoder_kernels.h"
+#include "src/turbomind/models/llama/llama_decoder_kernels.h"
-#include "src/fastertransformer/models/llama/llama_kernels.h"
+#include "src/turbomind/models/llama/llama_kernels.h"
-#include "src/fastertransformer/models/llama/llama_utils.h"
+#include "src/turbomind/models/llama/llama_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 LlamaDecoder<T>::LlamaDecoder(size_t head_num,
@@ -50,14 +50,14 @@ LlamaDecoder<T>::LlamaDecoder(size_t head_num,
     tensor_para_(tensor_para),
     data_type_(getTensorType<T>())
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     initialize(quant_policy);
 }
 template<typename T>
 LlamaDecoder<T>::~LlamaDecoder()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     delete self_attention_layer_;
     delete silu_ffn_layer_;
 }
@@ -65,7 +65,7 @@ LlamaDecoder<T>::~LlamaDecoder()
 template<typename T>
 void LlamaDecoder<T>::initialize(int quant_policy)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     self_attention_layer_ = new LlamaDecoderSelfAttentionLayer<T>(head_num_,
                                                                   size_per_head_,
@@ -97,14 +97,14 @@ void LlamaDecoder<T>::allocateBuffer()
 template<typename T>
 void LlamaDecoder<T>::allocateBuffer(size_t batch_size)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     is_allocate_buffer_ = true;
 }
 template<typename T>
 void LlamaDecoder<T>::freeBuffer()
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (is_allocate_buffer_) {
         is_allocate_buffer_ = false;
     }
@@ -116,7 +116,7 @@ void LlamaDecoder<T>::forwardSelfAttn(const LlamaDecoder::Session&
                                       const std::unordered_map<std::string, Tensor>* input_tensors,
                                       size_t layer)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     TensorMap self_attention_input_tensors(*input_tensors);
     self_attention_input_tensors.insert("input_query",
                                         {MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, attn_io});
@@ -157,7 +157,7 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
                              const std::unordered_map<std::string, Tensor>* input_tensors,
                              const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     /**
      * input_tensors:
      * \param decoder_input [batch_size, hidden_dims]
@@ -242,4 +242,4 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
 template class LlamaDecoder<half>;
 template class LlamaDecoder<float>;
-} // namespace fastertransformer
+} // namespace turbomind

@@ -17,17 +17,17 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.h
-#include "src/fastertransformer/layers/BaseLayer.h"
+#include "src/turbomind/layers/BaseLayer.h"
-// #include "src/fastertransformer/layers/FfnLayer.h"
+// #include "src/turbomind/layers/FfnLayer.h"
-#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
+#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
-#include "src/fastertransformer/models/llama/LlamaDecoderSelfAttentionLayer.h"
+#include "src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.h"
-#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
+#include "src/turbomind/models/llama/LlamaFfnLayer.h"
-#include "src/fastertransformer/utils/custom_ar_comm.h"
+#include "src/turbomind/utils/custom_ar_comm.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 class LlamaDecoder: public BaseLayer {
@@ -93,4 +93,4 @@ public:
                  const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
 };
-} // namespace fastertransformer
+} // namespace turbomind

@@ -16,13 +16,13 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc
-#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
+#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/logger.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(size_t hidden_units,
@@ -122,12 +122,12 @@ void LlamaDecoderLayerWeight<T>::mallocWeights()
     deviceMalloc((T**)&self_attn_norm_weights, hidden_units_);
     deviceMalloc((T**)&ffn_norm_weights, hidden_units_);
-    fastertransformer::mallocWeights(self_attn_weights.qkv, attn_bias_);
+    turbomind::mallocWeights(self_attn_weights.qkv, attn_bias_);
-    fastertransformer::mallocWeights(self_attn_weights.output, attn_bias_);
+    turbomind::mallocWeights(self_attn_weights.output, attn_bias_);
-    fastertransformer::mallocWeights(ffn_weights.gating, false);
+    turbomind::mallocWeights(ffn_weights.gating, false);
-    fastertransformer::mallocWeights(ffn_weights.intermediate, false);
+    turbomind::mallocWeights(ffn_weights.intermediate, false);
-    fastertransformer::mallocWeights(ffn_weights.output, false);
+    turbomind::mallocWeights(ffn_weights.output, false);
 }
 template<typename T>
@@ -175,4 +175,4 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
 template struct LlamaDecoderLayerWeight<float>;
 template struct LlamaDecoderLayerWeight<half>;
-} // namespace fastertransformer
+} // namespace turbomind

@@ -16,13 +16,13 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.h
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.h
 #pragma once
-#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
+#include "src/turbomind/models/llama/LlamaDenseWeight.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 struct LlamaDecoderLayerWeight {
@@ -58,4 +58,4 @@ private:
     void mallocWeights();
 };
-} // namespace fastertransformer
+} // namespace turbomind

@@ -16,18 +16,18 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.cc
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/DecoderSelfAttentionLayer.cc
-#include "src/fastertransformer/models/llama/LlamaDecoderSelfAttentionLayer.h"
+#include "src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.h"
-#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h"
+#include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
-#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
+#include "src/turbomind/models/llama/LlamaNcclGuard.h"
-#include "src/fastertransformer/models/llama/llama_kernels.h"
+#include "src/turbomind/models/llama/llama_kernels.h"
-#include "src/fastertransformer/models/llama/llama_utils.h"
+#include "src/turbomind/models/llama/llama_utils.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/nvtx_utils.h"
+#include "src/turbomind/utils/nvtx_utils.h"
 #include <string>
 // #include <glog/logging.h>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 struct SATypeConverter {
@@ -157,7 +157,7 @@ static inline void fusedQKV_masked_attention_dispatch(const T* qkv_buf,
 template<typename T>
 void LlamaDecoderSelfAttentionLayer<T>::allocateBuffer(size_t batch_size, int key_len, int max_memory_len)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     qkv_buf_ =
         reinterpret_cast<T*>(allocator_->reMalloc(qkv_buf_, sizeof(T) * batch_size * 3 * local_hidden_units_, false));
     context_buf_ =
@@ -289,4 +289,4 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
 template class LlamaDecoderSelfAttentionLayer<float>;
 template class LlamaDecoderSelfAttentionLayer<half>;
-} // namespace fastertransformer
+} // namespace turbomind

@@ -16,16 +16,16 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/DecoderSelfAttentionLayer.h
 #pragma once
-#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
+#include "src/turbomind/models/llama/LlamaDenseWeight.h"
-#include "src/fastertransformer/models/llama/LlamaLinear.h"
+#include "src/turbomind/models/llama/LlamaLinear.h"
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/utils/Tensor.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 class LlamaDecoderSelfAttentionLayer {
@@ -97,4 +97,4 @@ private:
     bool is_allocate_buffer_{};
 };
-} // namespace fastertransformer
+} // namespace turbomind

@@ -15,15 +15,15 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h
+// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/DenseWeight.h
 #pragma once
-#include "src/fastertransformer/layers/FfnWeight.h"
+#include "src/turbomind/layers/FfnWeight.h"
-#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
+#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 enum class WeightType : int {
     kFP32,
@@ -75,4 +75,4 @@ struct LlamaFfnWeight {
     LlamaDenseWeight<T> output;
 };
-} // namespace fastertransformer
+} // namespace turbomind

@@ -15,15 +15,15 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.h
+// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/FfnLayer.h
-#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
+#include "src/turbomind/models/llama/LlamaFfnLayer.h"
-#include "src/fastertransformer/kernels/activation_kernels.h"
+#include "src/turbomind/kernels/activation_kernels.h"
-#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
+#include "src/turbomind/models/llama/LlamaNcclGuard.h"
-#include "src/fastertransformer/utils/nvtx_utils.h"
+#include "src/turbomind/utils/nvtx_utils.h"
 // #include <glog/logging.h>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void LlamaFfnLayer<T>::allocateBuffer(size_t token_num)
@@ -110,4 +110,4 @@ void LlamaFfnLayer<T>::forward(TensorMap* output_tensors,
 template class LlamaFfnLayer<float>;
 template class LlamaFfnLayer<half>;
-} // namespace fastertransformer
+} // namespace turbomind

@@ -15,18 +15,18 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.cc
+// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/FfnLayer.cc
 #pragma once
-// #include "src/fastertransformer/layers/FfnLayer.h"
+// #include "src/turbomind/layers/FfnLayer.h"
-#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
+#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
-#include "src/fastertransformer/models/llama/LlamaLinear.h"
+#include "src/turbomind/models/llama/LlamaLinear.h"
-#include "src/fastertransformer/utils/custom_ar_comm.h"
+#include "src/turbomind/utils/custom_ar_comm.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
 #include <functional>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 class LlamaFfnLayer {
@@ -82,4 +82,4 @@ private:
     bool is_allocate_buffer_{};
 };
-} // namespace fastertransformer
+} // namespace turbomind

@@ -2,10 +2,10 @@
 #pragma once
-#include "src/fastertransformer/models/llama/Barrier.h"
+#include "src/turbomind/models/llama/Barrier.h"
-#include "src/fastertransformer/utils/instance_comm.h"
+#include "src/turbomind/utils/instance_comm.h"
-namespace fastertransformer {
+namespace turbomind {
 class LlamaInstanceComm: public AbstractInstanceComm {
 public:
@@ -31,4 +31,4 @@ private:
     void* ptr{};
 };
-} // namespace fastertransformer
+} // namespace turbomind

@@ -2,12 +2,12 @@
 #pragma once
-#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
+#include "src/turbomind/models/llama/LlamaDenseWeight.h"
-#include "src/fastertransformer/models/llama/llama_kernels.h"
+#include "src/turbomind/models/llama/llama_kernels.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 class LlamaLinear {
@@ -58,4 +58,4 @@ private:
     cudaStream_t stream_{};
 };
-} // namespace fastertransformer
+} // namespace turbomind

@@ -2,14 +2,14 @@
 #pragma once
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
 #include <array>
 #include <atomic>
 #include <condition_variable>
 #include <cuda_runtime.h>
 #include <mutex>
-namespace fastertransformer {
+namespace turbomind {
 struct NcclGuard {
     static constexpr int kMaxGroupCount = 32;
@@ -89,4 +89,4 @@ struct NcclGuard {
     std::unique_ptr<std::lock_guard<std::mutex>> global_nccl_lock_;
 };
-} // namespace fastertransformer
+} // namespace turbomind

@@ -18,24 +18,24 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGpt.cc
-#include "src/fastertransformer/models/llama/LlamaV2.h"
+#include "src/turbomind/models/llama/LlamaV2.h"
-#include "src/fastertransformer/kernels/decoding_kernels.h"
+#include "src/turbomind/kernels/decoding_kernels.h"
-#include "src/fastertransformer/kernels/gpt_kernels.h"
+#include "src/turbomind/kernels/gpt_kernels.h"
-#include "src/fastertransformer/models/llama/LlamaBatch.h"
+#include "src/turbomind/models/llama/LlamaBatch.h"
-#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
+#include "src/turbomind/models/llama/LlamaNcclGuard.h"
-#include "src/fastertransformer/models/llama/LlamaWeight.h"
+#include "src/turbomind/models/llama/LlamaWeight.h"
-#include "src/fastertransformer/models/llama/Request.h"
+#include "src/turbomind/models/llama/Request.h"
-#include "src/fastertransformer/models/llama/llama_utils.h"
+#include "src/turbomind/models/llama/llama_utils.h"
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/utils/Tensor.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
 #include <functional>
 #include <memory>
 #include <sstream>
 #include <stdexcept>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 LlamaV2<T>::LlamaV2(size_t head_num,
@@ -87,15 +87,15 @@ LlamaV2<T>::LlamaV2(size_t head_num,
     shared_state_(shared_state)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     FT_CHECK(vocab_size_ % tensor_para_.world_size_ == 0);
-    FT_LOG_INFO("NCCL group_id = %d", tensor_para_.group_id_);
+    TM_LOG_INFO("NCCL group_id = %d", tensor_para_.group_id_);
     size_t elem_bits = 0;
     if (quant_policy & QuantPolicy::kCacheKVInt8) {
         elem_bits = sizeof(int8_t) * 8;
         if (use_context_fmha) {
-            FT_LOG_ERROR("use_context_fmha not support int8");
+            TM_LOG_ERROR("use_context_fmha not support int8");
             assert(0);
         }
     }
@@ -128,7 +128,7 @@ LlamaV2<T>::~LlamaV2()
 template<typename T>
 void LlamaV2<T>::initialize(bool use_context_fmha, int quant_policy)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     context_decoder_ = new LlamaContextDecoder<T>(head_num_,
                                                   size_per_head_,
@@ -170,7 +170,7 @@ void LlamaV2<T>::initialize(bool use_context_fmha, int quant_policy)
 template<typename T>
 void LlamaV2<T>::embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     // ! This kernel can't be used in context decoding
     invokeEmbeddingLookupPosEncodingPadCount(embeddings,
                                              weights_->pre_decoder_embedding_table,
@@ -203,10 +203,10 @@ void LlamaV2<T>::contextDecode(T* deocder_output,
                                size_t session_len,
                                size_t batch_size)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     if (tensor_para_.rank_ == 0) {
-        FT_LOG_INFO("context decoding start");
+        TM_LOG_INFO("context decoding start");
     }
     invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf,
@@ -250,7 +250,7 @@ void LlamaV2<T>::contextDecode(T* deocder_output,
     context_decoder_->forward(&decoder_output_tensors, &decoder_input_tensors, &weights_->decoder_layer_weights);
     if (tensor_para_.rank_ == 0) {
-        FT_LOG_INFO("context decoding end");
+        TM_LOG_INFO("context decoding end");
     }
 }
@@ -267,7 +267,7 @@ void LlamaV2<T>::decoderForward(T* decoder_output,
                                 size_t session_len,
                                 size_t batch_size)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     const int max_seq_len = session_len;
     const auto dtype = getTensorType<T>();
@@ -298,7 +298,7 @@ void LlamaV2<T>::decoderForward(T* decoder_output,
 template<typename T>
 void LlamaV2<T>::postDecodeEmbedding(float* logits, float* local_logits, const T* decoder_output, int batch_size)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     cudaDataType_t data_type = getCudaDataType<T>();
     float alpha = 1.f;
     float beta = 0.f;
@@ -375,7 +375,7 @@ void LlamaV2<T>::dynamicDecode(int* token_ids,
                                size_t token_ids_len,
                                size_t batch_size)
 {
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
     int local_batch_size = (int)batch_size;
     std::unordered_map<std::string, Tensor> dynamic_decode_input_tensors{
@@ -421,7 +421,7 @@ void LlamaV2<T>::dynamicDecode(int* token_ids,
 template<typename T>
 void LlamaV2<T>::internalThreadEntry(int device_id)
 {
-    FT_LOG_INFO("[internalThreadEntry] %d", (int)tensor_para_.rank_);
+    TM_LOG_INFO("[internalThreadEntry] %d", (int)tensor_para_.rank_);
     check_cuda_error(cudaSetDevice(device_id));
     auto& request_queue = shared_state_->request_queue;
@@ -514,10 +514,10 @@ void LlamaV2<T>::forward(std::unordered_map<std::string, Tensor>* outputs,
     if (debug_) {
         if (tensor_para_.rank_ == 0) {
             for (const auto& kv : *inputs) {
-                FT_LOG_INFO("[forward][rank=%d] INPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
+                TM_LOG_INFO("[forward][rank=%d] INPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
             }
             for (const auto& kv : *outputs) {
-                FT_LOG_INFO("[forward][rank=%d] OUTPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
+                TM_LOG_INFO("[forward][rank=%d] OUTPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
             }
         }
     }
@@ -566,10 +566,10 @@ void LlamaV2<T>::forward(std::unordered_map<std::string, Tensor>* outputs,
     std::vector<int> error_codes;
     bool has_error = 0;
     if (rank == 0) {
-        FT_LOG_INFO("[forward] Enqueue requests");
+        TM_LOG_INFO("[forward] Enqueue requests");
         auto futures = shared_state_->request_queue.enqueue(std::move(requests));
-        FT_LOG_INFO("[forward] Wait for requests to complete ...");
+        TM_LOG_INFO("[forward] Wait for requests to complete ...");
         for (auto& f : futures) {
            auto ec = f.get();
            error_codes.push_back(ec);
@@ -594,4 +594,4 @@ void LlamaV2<T>::forward(std::unordered_map<std::string, Tensor>* outputs,
 template class LlamaV2<half>;
 template class LlamaV2<float>;
-} // namespace fastertransformer
+} // namespace turbomind

@@ -17,24 +17,24 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGpt.h
 #pragma once
-#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
+#include "src/turbomind/layers/DynamicDecodeLayer.h"
-#include "src/fastertransformer/models/llama/Barrier.h"
+#include "src/turbomind/models/llama/Barrier.h"
-#include "src/fastertransformer/models/llama/LlamaBatch.h"
+#include "src/turbomind/models/llama/LlamaBatch.h"
-#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
+#include "src/turbomind/models/llama/LlamaContextDecoder.h"
-#include "src/fastertransformer/models/llama/LlamaDecoder.h"
+#include "src/turbomind/models/llama/LlamaDecoder.h"
-#include "src/fastertransformer/models/llama/LlamaWeight.h"
+#include "src/turbomind/models/llama/LlamaWeight.h"
-#include "src/fastertransformer/models/llama/Request.h"
+#include "src/turbomind/models/llama/Request.h"
-#include "src/fastertransformer/utils/allocator.h"
+#include "src/turbomind/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasMMWrapper.h"
+#include "src/turbomind/utils/cublasMMWrapper.h"
-#include "src/fastertransformer/utils/instance_comm.h"
+#include "src/turbomind/utils/instance_comm.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
 #include <unordered_map>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 class LlamaV2 {
@@ -183,4 +183,4 @@ private:
     std::thread internal_thread_;
 };
-} // namespace fastertransformer
+} // namespace turbomind