Unverified commit 35d64462, authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
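The change is mechanical: include paths move from src/fastertransformer/ to src/turbomind/, the fastertransformer namespace (and the target of the ft alias) becomes turbomind, and the FT_LOG_* logging macros become TM_LOG_*. The sketch below illustrates the new convention; the printf-based TM_LOG_INFO definition and the report_step function are stand-ins for illustration only, not code from this diff.

#include <cstdio>

// Illustrative stand-in for the project's logger macro; in the real code base the
// macro is only renamed (FT_LOG_INFO -> TM_LOG_INFO), its implementation is unchanged.
#define TM_LOG_INFO(...) (std::printf("[TM][INFO] "), std::printf(__VA_ARGS__), std::printf("\n"))

namespace turbomind {  // previously: namespace fastertransformer

inline void report_step(int step)
{
    TM_LOG_INFO("step = %d", step);
}

}  // namespace turbomind

namespace ft = turbomind;  // previously: namespace ft = fastertransformer

int main()
{
    ft::report_step(42);
    return 0;
}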
@@ -16,11 +16,11 @@
 */
// Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.cc
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptWeight.cc
-#include "src/fastertransformer/models/llama/LlamaWeight.h"
+#include "src/turbomind/models/llama/LlamaWeight.h"
-namespace fastertransformer {
+namespace turbomind {
template<typename T>
LlamaWeight<T>::LlamaWeight(size_t hidden_units,
@@ -121,4 +121,4 @@ void LlamaWeight<T>::loadModel(std::string dir_path)
template struct LlamaWeight<float>;
template struct LlamaWeight<half>;
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,14 +16,14 @@
 */
// Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.h
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptWeight.h
#pragma once
-#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
+#include "src/turbomind/utils/memory_utils.h"
-namespace fastertransformer {
+namespace turbomind {
template<typename T>
struct LlamaWeight {
@@ -67,4 +67,4 @@ private:
size_t tensor_para_rank_;
};
-} // namespace fastertransformer
+} // namespace turbomind
@@ -2,7 +2,7 @@
#pragma once
-#include "src/fastertransformer/utils/Tensor.h"
+#include "src/turbomind/utils/Tensor.h"
#include <condition_variable>
#include <cstdint>
#include <future>
@@ -10,7 +10,7 @@
#include <queue>
#include <unordered_map>
-namespace fastertransformer {
+namespace turbomind {
struct Request {
uint64_t id;
@@ -87,4 +87,4 @@ private:
std::condition_variable cv_;
};
-} // namespace fastertransformer
+} // namespace turbomind
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "42_fused_multi_head_attention/kernel_forward.h"
#include "mma_accum_lambda_iterator.h"
......@@ -13,7 +13,7 @@
// modified from:
// https://github.com/NVIDIA/cutlass/blob/main/examples/41_fused_multi_head_attention/kernel_forward.h
namespace fastertransformer {
namespace turbomind {
template<
// dtype of Q/K/V/M
......@@ -907,4 +907,4 @@ void FlashAttentionOp<T>::operator()(Params& params, cudaStream_t st) const
template class FlashAttentionOp<float>;
template class FlashAttentionOp<half>;
} // namespace fastertransformer
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/models/llama/llama_decoder_kernels.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <cuda_fp16.h>
namespace cg = cooperative_groups;
namespace fastertransformer {
namespace turbomind {
template<typename T>
struct res_norm_ops_t {
......@@ -162,4 +162,4 @@ template void
invokeFusedAddBiasResidualRMSNorm(float*, float*, const float*, const float*, float, int, int, cudaStream_t);
template void invokeFusedAddBiasResidualRMSNorm(half*, half*, const half*, const half*, float, int, int, cudaStream_t);
} // namespace fastertransformer
} // namespace turbomind
@@ -2,10 +2,10 @@
#include <cuda_runtime.h>
-namespace fastertransformer {
+namespace turbomind {
template<typename T>
void invokeFusedAddBiasResidualRMSNorm(
T* residual, T* in_out, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -15,17 +15,17 @@
 */
// Copied from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/gpt_gemm.cc
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/gpt_gemm.cc
-#include "src/fastertransformer/utils/gemm_test/gpt_gemm_func.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/utils/gemm_test/gpt_gemm_func.h"
+#include "src/turbomind/utils/memory_utils.h"
-namespace ft = fastertransformer;
+namespace ft = turbomind;
int main(int argc, char* argv[])
{
if (argc < 9 || argc > 11) {
-FT_LOG_ERROR("./bin/llama_gemm batch_size \\ \n"
+TM_LOG_ERROR("./bin/llama_gemm batch_size \\ \n"
" beam_width \\ \n"
" max_input_len \\ \n"
" head_number \\ \n"
@@ -35,7 +35,7 @@ int main(int argc, char* argv[])
" data_type \\ \n"
" tensor_para_size \\\n"
" is_append (append new config into exist gemm_config.ini or not)");
-FT_LOG_ERROR("e.g. ./bin/llama_gemm 8 4 32 96 128 49152 51200 1 8 1");
+TM_LOG_ERROR("e.g. ./bin/llama_gemm 8 4 32 96 128 49152 51200 1 8 1");
return 0;
}
@@ -50,17 +50,17 @@ int main(int argc, char* argv[])
const int tensor_para_size = argc < 10 ? 1 : atoi(argv[9]);
const bool is_append = argc < 11 ? false : (bool)(atoi(argv[10]));
-FT_LOG_INFO("Arguments:");
-FT_LOG_INFO(" batch_size: %d", batch_size);
-FT_LOG_INFO(" beam_width: %d", beam_width);
-FT_LOG_INFO(" max_input_len: %d", max_input_len);
-FT_LOG_INFO(" head_num: %d", head_num);
-FT_LOG_INFO(" size_per_head: %d", size_per_head);
-FT_LOG_INFO(" inter_size: %d", inter_size);
-FT_LOG_INFO(" vocab_size: %d", vocab_size);
-FT_LOG_INFO(" data_type: %d", data_type);
-FT_LOG_INFO(" tensor_para_size: %d", tensor_para_size);
-FT_LOG_INFO(" is_append: %d", (int)is_append);
+TM_LOG_INFO("Arguments:");
+TM_LOG_INFO(" batch_size: %d", batch_size);
+TM_LOG_INFO(" beam_width: %d", beam_width);
+TM_LOG_INFO(" max_input_len: %d", max_input_len);
+TM_LOG_INFO(" head_num: %d", head_num);
+TM_LOG_INFO(" size_per_head: %d", size_per_head);
+TM_LOG_INFO(" inter_size: %d", inter_size);
+TM_LOG_INFO(" vocab_size: %d", vocab_size);
+TM_LOG_INFO(" data_type: %d", data_type);
+TM_LOG_INFO(" tensor_para_size: %d", tensor_para_size);
+TM_LOG_INFO(" is_append: %d", (int)is_append);
std::cout << std::endl;
void* gemm_test_buf;
......
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
namespace fastertransformer {
namespace turbomind {
// fp16, bf16
// n is divided by 2 for this impl
......@@ -688,4 +688,4 @@ void invokeGatherOutput(int* output_ids,
output_ids, ids, context_length, max_context_len, max_gen_step, max_output_len, batch_size);
}
} // namespace fastertransformer
} // namespace turbomind
@@ -2,15 +2,15 @@
#pragma once
-#include "src/fastertransformer/kernels/gpt_kernels.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/gpt_kernels.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
#include <assert.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <numeric>
-namespace fastertransformer {
+namespace turbomind {
template<typename T>
void invokeRootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, int m, int n, cudaStream_t stream);
@@ -160,7 +160,7 @@ inline void dump_sequence_len(int* d_seq_len, int step, int tp_rank, cudaStream_
int h_seq_len = -1;
cudaMemcpyAsync(&h_seq_len, d_seq_len, sizeof(int), cudaMemcpyDefault, st);
cudaStreamSynchronize(st);
-FT_LOG_ERROR("--------> rank = %d, step = %d, seq_len = %d <--------", tp_rank, step, h_seq_len);
+TM_LOG_ERROR("--------> rank = %d, step = %d, seq_len = %d <--------", tp_rank, step, h_seq_len);
}
-} // namespace fastertransformer
+} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <cmath>
#include <cstdio>
#include <cstdlib>
......@@ -14,7 +14,7 @@
#include <thrust/host_vector.h>
#include <vector>
namespace fastertransformer {
namespace turbomind {
CmpMode compare_mode = kCmpNone;
......@@ -157,4 +157,4 @@ bool isDebug()
return is_debug;
}
} // namespace fastertransformer
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/utils/Tensor.h"
#include "src/turbomind/utils/Tensor.h"
#include <cuda_runtime.h>
#include <sstream>
#include <string>
#include <vector>
namespace fastertransformer {
namespace turbomind {
enum QuantPolicy {
kNone = 0x00,
......@@ -64,4 +64,4 @@ size_t curandStateGetSize();
bool isDebug();
} // namespace fastertransformer
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/models/llama/prefix_cache.h"
#include "src/turbomind/models/llama/prefix_cache.h"
// <L,H,D/X,s,X> -> <L,H,D/X,S[:s],X>
template<typename T>
......
@@ -52,22 +52,22 @@
#include "triton/core/tritonbackend.h"
// FT's libraries have dependency with triton's lib
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/instance_comm.h"
-#include "src/fastertransformer/utils/mpi_utils.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/instance_comm.h"
+#include "src/turbomind/utils/mpi_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
std::exception_ptr ptr[8];
-namespace ft = fastertransformer;
+namespace ft = turbomind;
namespace triton {
namespace backend {
-namespace fastertransformer_backend {
+namespace turbomind_backend {
#define RESPOND_ALL_AND_RETURN_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \
do { \
@@ -1905,6 +1905,6 @@ TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute(TRITONBACKEND_ModelInstan
} // extern "C"
-} // namespace fastertransformer_backend
+} // namespace turbomind_backend
} // namespace backend
} // namespace triton
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-# Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/CMakeLists.txt
+# Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/multi_gpu_gpt/CMakeLists.txt
cmake_minimum_required(VERSION 3.8)
......
@@ -16,17 +16,17 @@
 */
// Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.cc
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.cc
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
#include "3rdparty/INIReader.h"
-#include "src/fastertransformer/models/llama/LlamaInstanceComm.h"
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
-#include "src/fastertransformer/utils/allocator.h"
+#include "src/turbomind/models/llama/LlamaInstanceComm.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/utils/allocator.h"
#include <mutex>
-namespace ft = fastertransformer;
+namespace ft = turbomind;
std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createLlamaModel(std::string inifile)
{
@@ -61,34 +61,34 @@ void LlamaTritonModel<T>::handleMissingParams()
{
if (!max_batch_size_) {
max_batch_size_ = 32;
-FT_LOG_WARNING("[LlamaTritonModel] `max_batch_size` is not set, default to %d.", (int)max_batch_size_);
+TM_LOG_WARNING("[LlamaTritonModel] `max_batch_size` is not set, default to %d.", (int)max_batch_size_);
}
if (!session_len_) {
session_len_ = 2160;
-FT_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)session_len_);
+TM_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)session_len_);
}
if (!max_context_token_num_) {
max_context_token_num_ = (int)std::sqrt(max_batch_size_);
-FT_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` is not set, default to %d.",
+TM_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` is not set, default to %d.",
(int)max_context_token_num_);
}
if (!step_length_) {
step_length_ = 1;
-FT_LOG_WARNING("[LlamaTritonModel] `step_length` is not set, default to %d.", (int)step_length_);
+TM_LOG_WARNING("[LlamaTritonModel] `step_length` is not set, default to %d.", (int)step_length_);
}
if (!cache_max_entry_count_) {
cache_max_entry_count_ = 32;
-FT_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %d.",
+TM_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %d.",
(int)cache_max_entry_count_);
}
if (!cache_chunk_size_) {
cache_chunk_size_ = cache_max_entry_count_;
-FT_LOG_WARNING("[LlamaTritonModel] `cache_chunk_size` is not set, default to %d.", (int)cache_chunk_size_);
+TM_LOG_WARNING("[LlamaTritonModel] `cache_chunk_size` is not set, default to %d.", (int)cache_chunk_size_);
}
}
@@ -341,7 +341,7 @@ LlamaTritonModel<T>::createNcclParams(const int node_id, const int device_id_sta
return AbstractTransformerModel::createNcclParams(node_id, device_id_start, multi_node);
}
else {
-FT_LOG_INFO("Skipping NCCL param creation.");
+TM_LOG_INFO("Skipping NCCL param creation.");
const int tensor_para_size = getTensorParaSize();
const int pipeline_para_size = getPipelineParaSize();
......
@@ -16,20 +16,20 @@
 */
// Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
#pragma once
-#include "src/fastertransformer/models/llama/LlamaV2.h"
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
-#include "src/fastertransformer/utils/cuda_utils.h"
-#include "src/fastertransformer/utils/custom_ar_comm.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/models/llama/LlamaV2.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/utils/cuda_utils.h"
+#include "src/turbomind/utils/custom_ar_comm.h"
+#include "src/turbomind/utils/nccl_utils.h"
#include <cuda_fp16.h>
#include <mutex>
-namespace ft = fastertransformer;
+namespace ft = turbomind;
template<typename T>
struct LlamaTritonSharedModelInstance;
......
@@ -16,13 +16,13 @@
 */
// Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
-#include "src/fastertransformer/triton_backend/triton_utils.hpp"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/triton_backend/triton_utils.hpp"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/cuda_utils.h"
#include <algorithm>
#include <functional>
#include <numeric>
@@ -30,7 +30,7 @@
#include <unordered_map>
#include <vector>
-namespace ft = fastertransformer;
+namespace ft = turbomind;
template<typename T>
void triton_stream_callback(std::unordered_map<std::string, ft::Tensor>* output_tensors, void* ctx)
@@ -53,7 +53,7 @@ template<typename T>
std::unordered_map<std::string, ft::Tensor> LlamaTritonModelInstance<T>::convert_inputs(
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
{
-FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+TM_LOG_DEBUG(__PRETTY_FUNCTION__);
move_tensor_H2D(input_tensors->at("input_ids"), d_input_ids_, &allocator_);
move_tensor_H2D(input_tensors->at("input_lengths"), d_input_lengths_, &allocator_);
@@ -126,7 +126,7 @@ template<typename T>
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
LlamaTritonModelInstance<T>::convert_outputs(const std::unordered_map<std::string, ft::Tensor>& output_tensors)
{
-FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+TM_LOG_DEBUG(__PRETTY_FUNCTION__);
std::unordered_map<std::string, triton::Tensor>* outputs_mapping =
new std::unordered_map<std::string, triton::Tensor>();
@@ -172,9 +172,9 @@ std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors,
ft::AbstractInstanceComm* instance_comm)
{
-FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+TM_LOG_DEBUG(__PRETTY_FUNCTION__);
// for (const auto& kv : *input_tensors) {
-// FT_LOG_INFO("%s: %s", kv.first.c_str(), format_vector(kv.second.shape).c_str());
+// TM_LOG_INFO("%s: %s", kv.first.c_str(), format_vector(kv.second.shape).c_str());
// }
FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape.size() == 2,
......
@@ -16,16 +16,16 @@
 */
// Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
#pragma once
-#include "src/fastertransformer/models/llama/LlamaV2.h"
-#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/models/llama/LlamaV2.h"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
#include <memory>
-namespace ft = fastertransformer;
+namespace ft = turbomind;
template<typename T>
struct LlamaTritonSharedModelInstance {
......
@@ -16,10 +16,10 @@
 */
// Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.cpp
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/transformer_triton_backend.cpp
-#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/utils/nccl_utils.h"
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>
AbstractTransformerModel::createNcclParams(const int node_id, const int device_id_start, const bool multi_node)
......
@@ -16,7 +16,7 @@
 */
// Modified from
-// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.hpp
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/transformer_triton_backend.hpp
#pragma once
@@ -25,13 +25,13 @@
#include <sys/time.h>
#include <vector>
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/custom_ar_comm.h"
-#include "src/fastertransformer/utils/instance_comm.h"
-#include "src/fastertransformer/utils/mpi_utils.h"
-#include "src/fastertransformer/utils/nccl_utils.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/custom_ar_comm.h"
+#include "src/turbomind/utils/instance_comm.h"
+#include "src/turbomind/utils/mpi_utils.h"
+#include "src/turbomind/utils/nccl_utils.h"
-namespace ft = fastertransformer;
+namespace ft = turbomind;
namespace triton {
#ifdef USE_TRITONSERVER_DATATYPE
......