Unverified commit 35d64462, authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
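
The change below is mechanical: every src/fastertransformer/... include path becomes src/turbomind/..., namespace fastertransformer becomes namespace turbomind, and the FT_LOG_* logging macros become TM_LOG_*. For out-of-tree code still written against the old name, a namespace alias is one possible bridge; this shim is hypothetical and not part of the commit:

// Hypothetical compatibility shim -- NOT part of this commit.
// Lets downstream code that still says fastertransformer::... compile
// against the renamed library while it migrates to the new name.
namespace turbomind {}
namespace fastertransformer = turbomind;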
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h"
-#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
+#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
 #include <assert.h>
 #include <float.h>
 #include <type_traits>
...
@@ -15,12 +15,12 @@
  */
 #pragma once
-#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h"
-#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
-#include "src/fastertransformer/models/llama/llama_utils.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
+#include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
+#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
+#include "src/turbomind/models/llama/llama_utils.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cuda_type_utils.cuh"
 #include <assert.h>
 #include <float.h>
 #include <type_traits>
...
@@ -16,12 +16,12 @@
 #pragma once
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cuda_type_utils.cuh"
 #include <stdint.h>
-using namespace fastertransformer;
+using namespace turbomind;
 namespace mmha {
...
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/kernels/decoding_kernels.h"
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/decoding_kernels.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/utils/cuda_type_utils.cuh"
+#include "src/turbomind/utils/cuda_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 // static const float HALF_FLT_MAX = 65504.F;
@@ -803,4 +803,4 @@ void invokeFinalize(int* output_ids,
                     max_seq_len);
 }
-} // namespace fastertransformer
+} // namespace turbomind
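
The commented-out HALF_FLT_MAX above is the largest finite fp16 value: (2 - 2^-10) * 2^15 = 65536 - 32 = 65504. A quick standalone check of that arithmetic (illustrative only, not part of the commit):

#include <cassert>

// 65504 = (2 - 2^-10) * 2^15: the largest finite IEEE fp16 value,
// which is what the commented-out HALF_FLT_MAX encodes.
int main()
{
    assert((2.0 - 1.0 / 1024.0) * 32768.0 == 65504.0);
    return 0;
}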
@@ -20,7 +20,7 @@
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void invokeDecodingInitialize(bool* finished,
@@ -167,4 +167,4 @@ void invokeFinalize(int* output_ids,
                     const int batch_size,
                     cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,11 +17,11 @@
 #include "cublas_v2.h"
 #include "gen_relative_pos_bias.h"
 #include "reduce_kernel_utils.cuh"
-#include "src/fastertransformer/kernels/activation_kernels.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/activation_kernels.h"
+#include "src/turbomind/utils/cuda_utils.h"
 #include <cstdio>
-namespace fastertransformer {
+namespace turbomind {
 /******************* invokeGenRelativePosBias ***********************/
 // relative_position_bias_table is [(2*window_size-1)*(2*window_size-1), headNum]
@@ -301,4 +301,4 @@ template void invokeGenRelativePosBiasV2(half* relative_position_bias,
                     const int cpb_mlp_out_dim,
                     const int head_num,
                     cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,13 +16,13 @@
 #pragma once
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
 #include <assert.h>
 #include <cuda_runtime.h>
 #include <stdint.h>
-namespace fastertransformer {
+namespace turbomind {
 enum class PositionEmbeddingType {
     relative,
@@ -52,4 +52,4 @@ void invokeGenRelativePosBiasV2(T* relative_position_bias,
                     const int cpb_mlp_out_dim,
                     const int head_num,
                     cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
 #elif (CUDART_VERSION >= 11050)
@@ -22,10 +22,10 @@
 #else
 #include "3rdparty/cub/cub.cuh"
 #endif
-#include "src/fastertransformer/kernels/gpt_kernels.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/gpt_kernels.h"
+#include "src/turbomind/utils/memory_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 // PROMPT_SRC: 0 --> no prompts, 1 --> from loaded prompts, 2 --> from request prompts
 template<typename T, bool OUTPUT_ID, int PROMPT_SRC>
@@ -1108,4 +1108,4 @@ INSTANTIATE_INVOKE_SUM_LENGTH_DIMENSION(__nv_bfloat16);
 #endif
 #undef INSTANTIATE_INVOKE_SUM_LENGTH_DIMENSION
-} // namespace fastertransformer
+} // namespace turbomind
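
The PROMPT_SRC comment above documents a compile-time dispatch: the prompt source is an integer template parameter, so each variant is a separate instantiation and the branch is resolved at compile time. A minimal illustration of the pattern (hypothetical names, C++17 if constexpr; the real kernels dispatch differently):

// Illustrative only: an int template parameter turns the prompt-source
// choice (0 = none, 1 = loaded, 2 = from request) into a compile-time branch.
template<int PROMPT_SRC>
__device__ float pick_embedding(float from_vocab, float from_loaded, float from_request)
{
    if constexpr (PROMPT_SRC == 0) {
        return from_vocab;    // no prompts: plain vocab embedding lookup
    }
    else if constexpr (PROMPT_SRC == 1) {
        return from_loaded;   // prompts loaded with the model weights
    }
    else {
        return from_request;  // prompts supplied in the request
    }
}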
@@ -20,10 +20,10 @@
 #include <cuda_runtime.h>
 #include <unordered_map>
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/memory_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 struct inputIdsEmbeddingLookupPosEncodingSoftPromptParam {
@@ -238,4 +238,4 @@ void invokeSumLengthDimension(float* out_buf,
                     const size_t hidden_dim,
                     cudaStream_t stream = 0);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -26,11 +26,11 @@
 #include "3rdparty/cub/cub.cuh"
 #endif
-#include "src/fastertransformer/kernels/logprob_kernels.h"
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/kernels/logprob_kernels.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/utils/logger.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 __global__ void log_probs_kernel(float* log_probs,
@@ -158,7 +158,7 @@ void invokeLogProbFromLogits(float* cum_log_probs,
 // input_lengths: [batch_size]
 // workspace: workspace buffer of size at least sizeof(float) * max_input_length * batch_size.
-    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
 // block_size should be multiple of 32 to use warpReduceMax.
 const int block_size = vocab_size < 1024 ? (vocab_size + 31) / 32 * 32 : 1024;
 assert(block_size % 32 == 0);
@@ -207,4 +207,4 @@ template void invokeLogProbFromLogits(float* cum_log_probs,
                     const size_t workspace_size,
                     cudaStream_t stream,
                     const bool batch_first);
-} // end of namespace fastertransformer
+} // end of namespace turbomind
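
The block_size expression in the hunk above rounds vocab_size up to the next multiple of 32 (the warp size, required by warpReduceMax) and caps it at 1024, the per-block thread limit. A standalone check of that arithmetic (illustrative only):

#include <cassert>

// Same expression as in the hunk above: round up to a multiple of 32,
// capped at 1024 threads per block.
static int round_block_size(int vocab_size)
{
    return vocab_size < 1024 ? (vocab_size + 31) / 32 * 32 : 1024;
}

int main()
{
    assert(round_block_size(1) == 32);       // even tiny vocabs fill one warp
    assert(round_block_size(32) == 32);      // exact multiples are unchanged
    assert(round_block_size(1000) == 1024);  // 32 * ceil(1000 / 32) = 1024
    assert(round_block_size(50000) == 1024); // large vocabs are capped
    return 0;
}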
@@ -16,7 +16,7 @@
 #pragma once
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void invokeLogProbFromLogits(float* cum_log_probs,
@@ -31,4 +31,4 @@ void invokeLogProbFromLogits(float* cum_log_probs,
                     const size_t workspace_size,
                     cudaStream_t stream,
                     const bool batch_first = false);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -22,11 +22,11 @@
 #include "3rdparty/cub/cub.cuh"
 #endif
-#include "src/fastertransformer/kernels/online_softmax_beamsearch_kernels.h"
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/online_softmax_beamsearch_kernels.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/utils/cuda_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 #define DO_SPLIT_SMALL_TOP_K_SOFTMAX
 static const int SMALL_TOP_K_SOFTMAX_THREADBLOCK_SIZE = 256;
@@ -736,4 +736,4 @@ template void invokeTopkSoftMax<half>(const half* log_probs,
                     const float length_penalty,
                     cudaStream_t stream);
-} // end of namespace fastertransformer
+} // end of namespace turbomind
@@ -15,9 +15,9 @@
  */
 #pragma once
-#include "src/fastertransformer/kernels/beam_search_topk_kernels.h"
+#include "src/turbomind/kernels/beam_search_topk_kernels.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void invokeTopkSoftMax(const T* log_probs,
@@ -38,4 +38,4 @@ void invokeTopkSoftMax(const T* log_probs,
                     const float length_penalty,
                     cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -19,9 +19,9 @@
 #include <string>
 #include <unordered_map>
-#include "src/fastertransformer/utils/string_utils.h"
+#include "src/turbomind/utils/string_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 enum class RepetitionPenaltyType {
     Additive,  // the presence penalty
@@ -42,4 +42,4 @@ inline float getDefaultPenaltyValue(RepetitionPenaltyType penalty_type)
     return 0.0f;
 }
-} // namespace fastertransformer
+} // namespace turbomind
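
RepetitionPenaltyType distinguishes an additive penalty (the presence penalty, subtracted from the logits of tokens already generated) from the usual multiplicative repetition penalty. A minimal single-token sketch of the two flavours, assuming the collapsed part of the enum also has a Multiplicative member; the real kernels in the sampling penalty file operate on whole batches on the GPU:

// Illustrative only. Additive: subtract a constant from a seen token's
// logit. Multiplicative (CTRL-style): shrink positive logits and grow
// negative ones, both of which lower the token's probability.
inline float apply_penalty(float logit, float penalty, RepetitionPenaltyType type)
{
    if (type == RepetitionPenaltyType::Additive) {
        return logit - penalty;
    }
    return logit > 0.0f ? logit / penalty : logit * penalty;
}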
@@ -21,8 +21,8 @@
 #else
 #include <cooperative_groups.h>
 #endif
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_type_utils.cuh"
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
 #include <curand_kernel.h>
@@ -31,7 +31,7 @@
 namespace cg = cooperative_groups;
-namespace fastertransformer {
+namespace turbomind {
 template<int VPT>
 struct BytesToType;
@@ -363,4 +363,4 @@ __device__ __forceinline__ __nv_bfloat16 clamp_inf_for_half(const float input)
 }
 #endif
-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,9 +17,9 @@
 #include <assert.h>
 #include <float.h>
-#include "src/fastertransformer/kernels/sampling_penalty_kernels.h"
+#include "src/turbomind/kernels/sampling_penalty_kernels.h"
-namespace fastertransformer {
+namespace turbomind {
 // TODO Add half2 implementation
 template<typename T>
@@ -534,4 +534,4 @@ template void invokeMinLengthPenalty(half* logits,
                     const int vocab_size_padded,
                     cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,10 +17,10 @@
 #include <cuda_fp16.h>
-#include "src/fastertransformer/kernels/penalty_types.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/penalty_types.h"
+#include "src/turbomind/utils/cuda_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void invokeApplyRepetitionPenalty(T* logits,
@@ -78,4 +78,4 @@ void invokeMinLengthPenalty(T* logits,
                     const int vocab_size_padded,
                     cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -24,10 +24,10 @@
 #include "3rdparty/cub/cub.cuh"
 #endif
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/kernels/sampling_topk_kernels.h"
-namespace fastertransformer {
+namespace turbomind {
 __global__ void curandInitialize(curandState_t* state, const int size, const unsigned long long random_seed)
 {
@@ -575,4 +575,4 @@ template void invokeTopKTopPSampling(void* workspace,
                     const int* end_ids,
                     cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
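
The body of curandInitialize is collapsed in the diff; a kernel with that signature typically gives each sampling slot its own curand sub-sequence so that random draws are independent across the batch. A hedged sketch of such a body (the actual implementation may differ):

#include <curand_kernel.h>

// One thread per slot: seed every curandState_t with the same seed but a
// distinct sub-sequence (the slot index), so per-slot streams don't overlap.
__global__ void curandInitializeSketch(curandState_t* state,
                                       const int size,
                                       const unsigned long long random_seed)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        curand_init(random_seed, /*subsequence=*/idx, /*offset=*/0, &state[idx]);
    }
}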
@@ -16,9 +16,9 @@
  */
 #pragma once
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/logger.h"
 #include <curand_kernel.h>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void invokeTopKSampling(void* workspace,
@@ -95,4 +95,4 @@ void invokeTopKTopPSampling(void* workspace,
                     const int* end_ids,
                     cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -22,14 +22,14 @@
 #include "3rdparty/cub/cub.cuh"
 #endif
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/kernels/sampling_topp_kernels.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/kernels/sampling_topp_kernels.h"
+#include "src/turbomind/utils/cuda_utils.h"
 constexpr int ENABLE_SINGLE_PASS_TOP_P = 0;
 constexpr float SINGLE_PASS_THRESHOLD = 0.9;
-namespace fastertransformer {
+namespace turbomind {
 namespace segmented_topp_impl {
@@ -1426,4 +1426,4 @@ void invokeComputeToppDecay(float* runtime_top_p,
     runtime_top_p, runtime_initial_top_p, output_ids, top_p_decay, top_p_min, top_p_reset_ids, local_batch_size);
 }
-} // namespace fastertransformer
+} // namespace turbomind
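
The argument list of invokeComputeToppDecay suggests the decay rule: each step, shrink a slot's runtime top-p by its decay factor, floor it at top_p_min, and restore the initial value when that slot emits its reset token. A hedged per-slot sketch under those assumptions (argument types simplified; the real kernel may differ):

// Illustrative only: one thread per batch slot.
__global__ void toppDecaySketch(float* runtime_top_p,
                                const float* runtime_initial_top_p,
                                const int* output_ids,
                                const float* top_p_decay,
                                const float* top_p_min,
                                const int* top_p_reset_ids,
                                const int local_batch_size)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= local_batch_size) {
        return;
    }
    if (output_ids[i] == top_p_reset_ids[i]) {
        runtime_top_p[i] = runtime_initial_top_p[i];  // reset token seen
    }
    else {
        runtime_top_p[i] = fmaxf(runtime_top_p[i] * top_p_decay[i], top_p_min[i]);
    }
}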