Unverified Commit 35d64462 authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
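The diff below is a mechanical rename across the FasterTransformer-derived sources: include paths move from src/fastertransformer/... to src/turbomind/..., namespace fastertransformer becomes namespace turbomind (and the alias namespace ft = turbomind), and the FT_LOG_* logging macros become TM_LOG_*. Condensed from the hunks below into an illustrative before/after sketch (not a literal file from the repo; the logged string is a placeholder):

// Before (as inherited from FasterTransformer, e.g. in what is presumably LlamaWeight.cc):
#include "src/fastertransformer/models/llama/LlamaWeight.h"
namespace fastertransformer {
// ...
FT_LOG_INFO("...");
}  // namespace fastertransformer

// After this commit:
#include "src/turbomind/models/llama/LlamaWeight.h"
namespace turbomind {
// ...
TM_LOG_INFO("...");
}  // namespace turbomind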
@@ -16,11 +16,11 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.cc
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptWeight.cc
-#include "src/fastertransformer/models/llama/LlamaWeight.h"
+#include "src/turbomind/models/llama/LlamaWeight.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 LlamaWeight<T>::LlamaWeight(size_t hidden_units,
@@ -121,4 +121,4 @@ void LlamaWeight<T>::loadModel(std::string dir_path)
 template struct LlamaWeight<float>;
 template struct LlamaWeight<half>;
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,14 +16,14 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.h
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptWeight.h
 #pragma once
-#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
+#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 struct LlamaWeight {
@@ -67,4 +67,4 @@ private:
 size_t tensor_para_rank_;
 };
-} // namespace fastertransformer
+} // namespace turbomind
@@ -2,7 +2,7 @@
 #pragma once
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/utils/Tensor.h"
 #include <condition_variable>
 #include <cstdint>
 #include <future>
@@ -10,7 +10,7 @@
 #include <queue>
 #include <unordered_map>
-namespace fastertransformer {
+namespace turbomind {
 struct Request {
 uint64_t id;
@@ -87,4 +87,4 @@ private:
 std::condition_variable cv_;
 };
-} // namespace fastertransformer
+} // namespace turbomind
#include "src/fastertransformer/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h"
#include "src/fastertransformer/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
#include "42_fused_multi_head_attention/kernel_forward.h" #include "42_fused_multi_head_attention/kernel_forward.h"
#include "mma_accum_lambda_iterator.h" #include "mma_accum_lambda_iterator.h"
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
// modified from: // modified from:
// https://github.com/NVIDIA/cutlass/blob/main/examples/41_fused_multi_head_attention/kernel_forward.h // https://github.com/NVIDIA/cutlass/blob/main/examples/41_fused_multi_head_attention/kernel_forward.h
namespace fastertransformer { namespace turbomind {
template< template<
// dtype of Q/K/V/M // dtype of Q/K/V/M
...@@ -907,4 +907,4 @@ void FlashAttentionOp<T>::operator()(Params& params, cudaStream_t st) const ...@@ -907,4 +907,4 @@ void FlashAttentionOp<T>::operator()(Params& params, cudaStream_t st) const
template class FlashAttentionOp<float>; template class FlashAttentionOp<float>;
template class FlashAttentionOp<half>; template class FlashAttentionOp<half>;
} // namespace fastertransformer } // namespace turbomind
 // Copyright (c) OpenMMLab. All rights reserved.
-#include "src/fastertransformer/models/llama/llama_decoder_kernels.h"
+#include "src/turbomind/models/llama/llama_decoder_kernels.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 #include <cuda_fp16.h>
 namespace cg = cooperative_groups;
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 struct res_norm_ops_t {
@@ -162,4 +162,4 @@ template void
 invokeFusedAddBiasResidualRMSNorm(float*, float*, const float*, const float*, float, int, int, cudaStream_t);
 template void invokeFusedAddBiasResidualRMSNorm(half*, half*, const half*, const half*, float, int, int, cudaStream_t);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -2,10 +2,10 @@
 #include <cuda_runtime.h>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void invokeFusedAddBiasResidualRMSNorm(
     T* residual, T* in_out, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
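To make the declaration above concrete, here is a minimal, hypothetical call site for invokeFusedAddBiasResidualRMSNorm under the new namespace. The wrapper function, buffer names, epsilon value, and the reading of batch_size/n_dims as rows times hidden size are assumptions for illustration; only the signature itself comes from this diff.

// Hypothetical usage sketch (not part of this commit).
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include <cuda_fp16.h>

void fused_add_bias_residual_rmsnorm_fp16(half*        residual,    // [batch_size, n_dims] residual stream (assumed updated in place)
                                          half*        hidden,      // [batch_size, n_dims] activations (assumed normalized in place)
                                          const half*  bias,        // [n_dims]
                                          const half*  rms_weight,  // [n_dims] RMSNorm scale
                                          int          batch_size,
                                          int          n_dims,
                                          cudaStream_t stream)
{
    const float eps = 1e-6f;  // assumed epsilon; the real value would come from the model config
    // An explicit half instantiation exists per the .cu hunk earlier in this diff.
    turbomind::invokeFusedAddBiasResidualRMSNorm(residual, hidden, bias, rms_weight, eps, batch_size, n_dims, stream);
}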
@@ -15,17 +15,17 @@
  */
 // Copied from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/gpt_gemm.cc
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/gpt_gemm.cc
-#include "src/fastertransformer/utils/gemm_test/gpt_gemm_func.h"
+#include "src/turbomind/utils/gemm_test/gpt_gemm_func.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/utils/memory_utils.h"
-namespace ft = fastertransformer;
+namespace ft = turbomind;
 int main(int argc, char* argv[])
 {
 if (argc < 9 || argc > 11) {
-FT_LOG_ERROR("./bin/llama_gemm batch_size \\ \n"
+TM_LOG_ERROR("./bin/llama_gemm batch_size \\ \n"
 " beam_width \\ \n"
 " max_input_len \\ \n"
 " head_number \\ \n"
@@ -35,7 +35,7 @@ int main(int argc, char* argv[])
 " data_type \\ \n"
 " tensor_para_size \\\n"
 " is_append (append new config into exist gemm_config.ini or not)");
-FT_LOG_ERROR("e.g. ./bin/llama_gemm 8 4 32 96 128 49152 51200 1 8 1");
+TM_LOG_ERROR("e.g. ./bin/llama_gemm 8 4 32 96 128 49152 51200 1 8 1");
 return 0;
 }
@@ -50,17 +50,17 @@ int main(int argc, char* argv[])
 const int tensor_para_size = argc < 10 ? 1 : atoi(argv[9]);
 const bool is_append = argc < 11 ? false : (bool)(atoi(argv[10]));
-FT_LOG_INFO("Arguments:");
+TM_LOG_INFO("Arguments:");
-FT_LOG_INFO(" batch_size: %d", batch_size);
+TM_LOG_INFO(" batch_size: %d", batch_size);
-FT_LOG_INFO(" beam_width: %d", beam_width);
+TM_LOG_INFO(" beam_width: %d", beam_width);
-FT_LOG_INFO(" max_input_len: %d", max_input_len);
+TM_LOG_INFO(" max_input_len: %d", max_input_len);
-FT_LOG_INFO(" head_num: %d", head_num);
+TM_LOG_INFO(" head_num: %d", head_num);
-FT_LOG_INFO(" size_per_head: %d", size_per_head);
+TM_LOG_INFO(" size_per_head: %d", size_per_head);
-FT_LOG_INFO(" inter_size: %d", inter_size);
+TM_LOG_INFO(" inter_size: %d", inter_size);
-FT_LOG_INFO(" vocab_size: %d", vocab_size);
+TM_LOG_INFO(" vocab_size: %d", vocab_size);
-FT_LOG_INFO(" data_type: %d", data_type);
+TM_LOG_INFO(" data_type: %d", data_type);
-FT_LOG_INFO(" tensor_para_size: %d", tensor_para_size);
+TM_LOG_INFO(" tensor_para_size: %d", tensor_para_size);
-FT_LOG_INFO(" is_append: %d", (int)is_append);
+TM_LOG_INFO(" is_append: %d", (int)is_append);
 std::cout << std::endl;
 void* gemm_test_buf;
...
 // Copyright (c) OpenMMLab. All rights reserved.
-#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
+#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/models/llama/llama_kernels.h"
+#include "src/turbomind/models/llama/llama_kernels.h"
-#include "src/fastertransformer/models/llama/llama_utils.h"
+#include "src/turbomind/models/llama/llama_utils.h"
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
+#include "src/turbomind/utils/cuda_type_utils.cuh"
-namespace fastertransformer {
+namespace turbomind {
 // fp16, bf16
 // n is divided by 2 for this impl
@@ -688,4 +688,4 @@ void invokeGatherOutput(int* output_ids,
 output_ids, ids, context_length, max_context_len, max_gen_step, max_output_len, batch_size);
 }
-} // namespace fastertransformer
+} // namespace turbomind
@@ -2,15 +2,15 @@
 #pragma once
-#include "src/fastertransformer/kernels/gpt_kernels.h"
+#include "src/turbomind/kernels/gpt_kernels.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
 #include <assert.h>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
 #include <numeric>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void invokeRootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, int m, int n, cudaStream_t stream);
@@ -160,7 +160,7 @@ inline void dump_sequence_len(int* d_seq_len, int step, int tp_rank, cudaStream_
 int h_seq_len = -1;
 cudaMemcpyAsync(&h_seq_len, d_seq_len, sizeof(int), cudaMemcpyDefault, st);
 cudaStreamSynchronize(st);
-FT_LOG_ERROR("--------> rank = %d, step = %d, seq_len = %d <--------", tp_rank, step, h_seq_len);
+TM_LOG_ERROR("--------> rank = %d, step = %d, seq_len = %d <--------", tp_rank, step, h_seq_len);
 }
-} // namespace fastertransformer
+} // namespace turbomind
 // Copyright (c) OpenMMLab. All rights reserved.
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/models/llama/llama_utils.h"
+#include "src/turbomind/models/llama/llama_utils.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
@@ -14,7 +14,7 @@
 #include <thrust/host_vector.h>
 #include <vector>
-namespace fastertransformer {
+namespace turbomind {
 CmpMode compare_mode = kCmpNone;
@@ -157,4 +157,4 @@ bool isDebug()
 return is_debug;
 }
-} // namespace fastertransformer
+} // namespace turbomind
 // Copyright (c) OpenMMLab. All rights reserved.
 #pragma once
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/utils/Tensor.h"
 #include <cuda_runtime.h>
 #include <sstream>
 #include <string>
 #include <vector>
-namespace fastertransformer {
+namespace turbomind {
 enum QuantPolicy {
 kNone = 0x00,
@@ -64,4 +64,4 @@ size_t curandStateGetSize();
 bool isDebug();
-} // namespace fastertransformer
+} // namespace turbomind
 // Copyright (c) OpenMMLab. All rights reserved.
-#include "src/fastertransformer/models/llama/prefix_cache.h"
+#include "src/turbomind/models/llama/prefix_cache.h"
 // <L,H,D/X,s,X> -> <L,H,D/X,S[:s],X>
 template<typename T>
...
@@ -52,22 +52,22 @@
 #include "triton/core/tritonbackend.h"
 // FT's libraries have dependency with triton's lib
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/utils/Tensor.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/instance_comm.h"
+#include "src/turbomind/utils/instance_comm.h"
-#include "src/fastertransformer/utils/mpi_utils.h"
+#include "src/turbomind/utils/mpi_utils.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
 std::exception_ptr ptr[8];
-namespace ft = fastertransformer;
+namespace ft = turbomind;
 namespace triton {
 namespace backend {
-namespace fastertransformer_backend {
+namespace turbomind_backend {
 #define RESPOND_ALL_AND_RETURN_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \
 do { \
@@ -1905,6 +1905,6 @@ TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute(TRITONBACKEND_ModelInstan
 } // extern "C"
-} // namespace fastertransformer_backend
+} // namespace turbomind_backend
 } // namespace backend
 } // namespace triton
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/CMakeLists.txt
+# Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/multi_gpu_gpt/CMakeLists.txt
 cmake_minimum_required(VERSION 3.8)
...
@@ -16,17 +16,17 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.cc
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.cc
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
 #include "3rdparty/INIReader.h"
-#include "src/fastertransformer/models/llama/LlamaInstanceComm.h"
+#include "src/turbomind/models/llama/LlamaInstanceComm.h"
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
-#include "src/fastertransformer/utils/allocator.h"
+#include "src/turbomind/utils/allocator.h"
 #include <mutex>
-namespace ft = fastertransformer;
+namespace ft = turbomind;
 std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createLlamaModel(std::string inifile)
 {
@@ -61,34 +61,34 @@ void LlamaTritonModel<T>::handleMissingParams()
 {
 if (!max_batch_size_) {
 max_batch_size_ = 32;
-FT_LOG_WARNING("[LlamaTritonModel] `max_batch_size` is not set, default to %d.", (int)max_batch_size_);
+TM_LOG_WARNING("[LlamaTritonModel] `max_batch_size` is not set, default to %d.", (int)max_batch_size_);
 }
 if (!session_len_) {
 session_len_ = 2160;
-FT_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)session_len_);
+TM_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)session_len_);
 }
 if (!max_context_token_num_) {
 max_context_token_num_ = (int)std::sqrt(max_batch_size_);
-FT_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` is not set, default to %d.",
+TM_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` is not set, default to %d.",
 (int)max_context_token_num_);
 }
 if (!step_length_) {
 step_length_ = 1;
-FT_LOG_WARNING("[LlamaTritonModel] `step_length` is not set, default to %d.", (int)step_length_);
+TM_LOG_WARNING("[LlamaTritonModel] `step_length` is not set, default to %d.", (int)step_length_);
 }
 if (!cache_max_entry_count_) {
 cache_max_entry_count_ = 32;
-FT_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %d.",
+TM_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %d.",
 (int)cache_max_entry_count_);
 }
 if (!cache_chunk_size_) {
 cache_chunk_size_ = cache_max_entry_count_;
-FT_LOG_WARNING("[LlamaTritonModel] `cache_chunk_size` is not set, default to %d.", (int)cache_chunk_size_);
+TM_LOG_WARNING("[LlamaTritonModel] `cache_chunk_size` is not set, default to %d.", (int)cache_chunk_size_);
 }
 }
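The warnings in handleMissingParams above name the keys that may be omitted from the model's INI config (read via INIReader in createLlamaModel) and the fallback values used in that case. Purely as a hypothetical illustration, with the section name being a guess and only the key names and defaults taken from the code above, the corresponding entries might look like:

[llama]                        ; section name assumed, not shown in this diff
max_batch_size        = 32     ; fallback value when unset
session_len           = 2160   ; fallback value when unset
max_context_token_num = 0      ; unset/0 falls back to (int)sqrt(max_batch_size)
step_length           = 1      ; fallback value when unset
cache_max_entry_count = 32     ; fallback value when unset
cache_chunk_size      = 32     ; unset falls back to cache_max_entry_count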
@@ -341,7 +341,7 @@ LlamaTritonModel<T>::createNcclParams(const int node_id, const int device_id_sta
 return AbstractTransformerModel::createNcclParams(node_id, device_id_start, multi_node);
 }
 else {
-FT_LOG_INFO("Skipping NCCL param creation.");
+TM_LOG_INFO("Skipping NCCL param creation.");
 const int tensor_para_size = getTensorParaSize();
 const int pipeline_para_size = getPipelineParaSize();
...
@@ -16,20 +16,20 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
 #pragma once
-#include "src/fastertransformer/models/llama/LlamaV2.h"
+#include "src/turbomind/models/llama/LlamaV2.h"
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/custom_ar_comm.h"
+#include "src/turbomind/utils/custom_ar_comm.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
 #include <cuda_fp16.h>
 #include <mutex>
-namespace ft = fastertransformer;
+namespace ft = turbomind;
 template<typename T>
 struct LlamaTritonSharedModelInstance;
...
@@ -16,13 +16,13 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
-#include "src/fastertransformer/triton_backend/triton_utils.hpp"
+#include "src/turbomind/triton_backend/triton_utils.hpp"
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/utils/Tensor.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/utils/cuda_utils.h"
 #include <algorithm>
 #include <functional>
 #include <numeric>
@@ -30,7 +30,7 @@
 #include <unordered_map>
 #include <vector>
-namespace ft = fastertransformer;
+namespace ft = turbomind;
 template<typename T>
 void triton_stream_callback(std::unordered_map<std::string, ft::Tensor>* output_tensors, void* ctx)
@@ -53,7 +53,7 @@ template<typename T>
 std::unordered_map<std::string, ft::Tensor> LlamaTritonModelInstance<T>::convert_inputs(
 std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
 {
-FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+TM_LOG_DEBUG(__PRETTY_FUNCTION__);
 move_tensor_H2D(input_tensors->at("input_ids"), d_input_ids_, &allocator_);
 move_tensor_H2D(input_tensors->at("input_lengths"), d_input_lengths_, &allocator_);
@@ -126,7 +126,7 @@ template<typename T>
 std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
 LlamaTritonModelInstance<T>::convert_outputs(const std::unordered_map<std::string, ft::Tensor>& output_tensors)
 {
-FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+TM_LOG_DEBUG(__PRETTY_FUNCTION__);
 std::unordered_map<std::string, triton::Tensor>* outputs_mapping =
 new std::unordered_map<std::string, triton::Tensor>();
@@ -172,9 +172,9 @@ std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
 LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors,
 ft::AbstractInstanceComm* instance_comm)
 {
-FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+TM_LOG_DEBUG(__PRETTY_FUNCTION__);
 // for (const auto& kv : *input_tensors) {
-// FT_LOG_INFO("%s: %s", kv.first.c_str(), format_vector(kv.second.shape).c_str());
+// TM_LOG_INFO("%s: %s", kv.first.c_str(), format_vector(kv.second.shape).c_str());
 // }
 FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape.size() == 2,
...
@@ -16,16 +16,16 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
 #pragma once
-#include "src/fastertransformer/models/llama/LlamaV2.h"
+#include "src/turbomind/models/llama/LlamaV2.h"
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
 #include <memory>
-namespace ft = fastertransformer;
+namespace ft = turbomind;
 template<typename T>
 struct LlamaTritonSharedModelInstance {
...
@@ -16,10 +16,10 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.cpp
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/transformer_triton_backend.cpp
-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
 std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>
 AbstractTransformerModel::createNcclParams(const int node_id, const int device_id_start, const bool multi_node)
...
@@ -16,7 +16,7 @@
  */
 // Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.hpp
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/transformer_triton_backend.hpp
 #pragma once
@@ -25,13 +25,13 @@
 #include <sys/time.h>
 #include <vector>
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/utils/Tensor.h"
-#include "src/fastertransformer/utils/custom_ar_comm.h"
+#include "src/turbomind/utils/custom_ar_comm.h"
-#include "src/fastertransformer/utils/instance_comm.h"
+#include "src/turbomind/utils/instance_comm.h"
-#include "src/fastertransformer/utils/mpi_utils.h"
+#include "src/turbomind/utils/mpi_utils.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
-namespace ft = fastertransformer;
+namespace ft = turbomind;
 namespace triton {
 #ifdef USE_TRITONSERVER_DATATYPE
...