Unverified commit 35d64462, authored by lvhan028, committed by GitHub

build turbomind (#35)

* build turbomind

* change namespace fastertransformer to turbomind

* change logger name
parent 53d2e42c
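
The diff below applies the rename mechanically: every `src/fastertransformer/...` include path becomes `src/turbomind/...`, the `fastertransformer` namespace becomes `turbomind`, and the `FT_LOG_*` logging macros become `TM_LOG_*`. As a minimal caller-side sketch (not part of this commit; the include path, enum, and function are taken from the diff below, while the `main` wrapper is purely illustrative):

    // Hypothetical snippet illustrating the rename; assumes the repo's include
    // directories are on the compiler's include path.
    #include "src/turbomind/kernels/penalty_types.h"  // was src/fastertransformer/kernels/penalty_types.h

    #include <cstdio>

    int main()
    {
        // The namespace changes from fastertransformer:: to turbomind::.
        using turbomind::RepetitionPenaltyType;
        const float v = turbomind::getDefaultPenaltyValue(RepetitionPenaltyType::Additive);
        std::printf("default additive repetition penalty: %f\n", v);
        return 0;
    }
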
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h"
-#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
+#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_utils.h"
 #include <assert.h>
 #include <float.h>
 #include <type_traits>
......
@@ -15,12 +15,12 @@
  */
 #pragma once
-#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h"
-#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
-#include "src/fastertransformer/models/llama/llama_utils.h"
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
+#include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
+#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
+#include "src/turbomind/models/llama/llama_utils.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cuda_type_utils.cuh"
 #include <assert.h>
 #include <float.h>
 #include <type_traits>
......
@@ -16,12 +16,12 @@
 #pragma once
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cuda_type_utils.cuh"
 #include <stdint.h>
-using namespace fastertransformer;
+using namespace turbomind;
 namespace mmha {
......
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/kernels/decoding_kernels.h"
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/decoding_kernels.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/utils/cuda_type_utils.cuh"
+#include "src/turbomind/utils/cuda_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 // static const float HALF_FLT_MAX = 65504.F;
@@ -803,4 +803,4 @@ void invokeFinalize(int* output_ids,
 max_seq_len);
 }
-} // namespace fastertransformer
+} // namespace turbomind
@@ -20,7 +20,7 @@
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void invokeDecodingInitialize(bool* finished,
@@ -167,4 +167,4 @@ void invokeFinalize(int* output_ids,
 const int batch_size,
 cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,11 +17,11 @@
 #include "cublas_v2.h"
 #include "gen_relative_pos_bias.h"
 #include "reduce_kernel_utils.cuh"
-#include "src/fastertransformer/kernels/activation_kernels.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/activation_kernels.h"
+#include "src/turbomind/utils/cuda_utils.h"
 #include <cstdio>
-namespace fastertransformer {
+namespace turbomind {
 /******************* invokeGenRelativePosBias ***********************/
 // relative_position_bias_table is [(2*window_size-1)*(2*window_size-1), headNum]
@@ -301,4 +301,4 @@ template void invokeGenRelativePosBiasV2(half* relative_position_bias,
 const int cpb_mlp_out_dim,
 const int head_num,
 cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,13 +16,13 @@
 #pragma once
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
 #include <assert.h>
 #include <cuda_runtime.h>
 #include <stdint.h>
-namespace fastertransformer {
+namespace turbomind {
 enum class PositionEmbeddingType {
 relative,
@@ -52,4 +52,4 @@ void invokeGenRelativePosBiasV2(T* relative_position_bias,
 const int cpb_mlp_out_dim,
 const int head_num,
 cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
-#include "src/fastertransformer/utils/cuda_fp8_utils.h"
+#include "src/turbomind/utils/cuda_fp8_utils.h"
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
 #elif (CUDART_VERSION >= 11050)
@@ -22,10 +22,10 @@
 #else
 #include "3rdparty/cub/cub.cuh"
 #endif
-#include "src/fastertransformer/kernels/gpt_kernels.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/kernels/gpt_kernels.h"
+#include "src/turbomind/utils/memory_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 // PROMPT_SRC: 0 --> no prompts, 1 --> from loaded prompts, 2 --> from request prompts
 template<typename T, bool OUTPUT_ID, int PROMPT_SRC>
@@ -1108,4 +1108,4 @@ INSTANTIATE_INVOKE_SUM_LENGTH_DIMENSION(__nv_bfloat16);
 #endif
 #undef INSTANTIATE_INVOKE_SUM_LENGTH_DIMENSION
-} // namespace fastertransformer
+} // namespace turbomind
@@ -20,10 +20,10 @@
 #include <cuda_runtime.h>
 #include <unordered_map>
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/memory_utils.h"
+#include "src/turbomind/utils/Tensor.h"
+#include "src/turbomind/utils/memory_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 struct inputIdsEmbeddingLookupPosEncodingSoftPromptParam {
@@ -238,4 +238,4 @@ void invokeSumLengthDimension(float* out_buf,
 const size_t hidden_dim,
 cudaStream_t stream = 0);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -26,11 +26,11 @@
 #include "3rdparty/cub/cub.cuh"
 #endif
-#include "src/fastertransformer/kernels/logprob_kernels.h"
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/kernels/logprob_kernels.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/utils/logger.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 __global__ void log_probs_kernel(float* log_probs,
@@ -158,7 +158,7 @@ void invokeLogProbFromLogits(float* cum_log_probs,
 // input_lengths: [batch_size]
 // workspace: workspace buffer of size at least sizeof(float) * max_input_length * batch_size.
-FT_LOG_DEBUG(__PRETTY_FUNCTION__);
+TM_LOG_DEBUG(__PRETTY_FUNCTION__);
 // block_size should be multiple of 32 to use warpReduceMax.
 const int block_size = vocab_size < 1024 ? (vocab_size + 31) / 32 * 32 : 1024;
 assert(block_size % 32 == 0);
@@ -207,4 +207,4 @@ template void invokeLogProbFromLogits(float* cum_log_probs,
 const size_t workspace_size,
 cudaStream_t stream,
 const bool batch_first);
-} // end of namespace fastertransformer
+} // end of namespace turbomind
@@ -16,7 +16,7 @@
 #pragma once
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void invokeLogProbFromLogits(float* cum_log_probs,
@@ -31,4 +31,4 @@ void invokeLogProbFromLogits(float* cum_log_probs,
 const size_t workspace_size,
 cudaStream_t stream,
 const bool batch_first = false);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -22,11 +22,11 @@
 #include "3rdparty/cub/cub.cuh"
 #endif
-#include "src/fastertransformer/kernels/online_softmax_beamsearch_kernels.h"
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/online_softmax_beamsearch_kernels.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/utils/cuda_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 #define DO_SPLIT_SMALL_TOP_K_SOFTMAX
 static const int SMALL_TOP_K_SOFTMAX_THREADBLOCK_SIZE = 256;
@@ -736,4 +736,4 @@ template void invokeTopkSoftMax<half>(const half* log_probs,
 const float length_penalty,
 cudaStream_t stream);
-} // end of namespace fastertransformer
+} // end of namespace turbomind
@@ -15,9 +15,9 @@
  */
 #pragma once
-#include "src/fastertransformer/kernels/beam_search_topk_kernels.h"
+#include "src/turbomind/kernels/beam_search_topk_kernels.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void invokeTopkSoftMax(const T* log_probs,
@@ -38,4 +38,4 @@ void invokeTopkSoftMax(const T* log_probs,
 const float length_penalty,
 cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -19,9 +19,9 @@
 #include <string>
 #include <unordered_map>
-#include "src/fastertransformer/utils/string_utils.h"
+#include "src/turbomind/utils/string_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 enum class RepetitionPenaltyType {
 Additive, // the presence penalty
@@ -42,4 +42,4 @@ inline float getDefaultPenaltyValue(RepetitionPenaltyType penalty_type)
 return 0.0f;
 }
-} // namespace fastertransformer
+} // namespace turbomind
@@ -21,8 +21,8 @@
 #else
 #include <cooperative_groups.h>
 #endif
-#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
+#include "src/turbomind/utils/cuda_bf16_wrapper.h"
+#include "src/turbomind/utils/cuda_type_utils.cuh"
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
 #include <curand_kernel.h>
@@ -31,7 +31,7 @@
 namespace cg = cooperative_groups;
-namespace fastertransformer {
+namespace turbomind {
 template<int VPT>
 struct BytesToType;
@@ -363,4 +363,4 @@ __device__ __forceinline__ __nv_bfloat16 clamp_inf_for_half(const float input)
 }
 #endif
-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,9 +17,9 @@
 #include <assert.h>
 #include <float.h>
-#include "src/fastertransformer/kernels/sampling_penalty_kernels.h"
+#include "src/turbomind/kernels/sampling_penalty_kernels.h"
-namespace fastertransformer {
+namespace turbomind {
 // TODO Add half2 implementation
 template<typename T>
@@ -534,4 +534,4 @@ template void invokeMinLengthPenalty(half* logits,
 const int vocab_size_padded,
 cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -17,10 +17,10 @@
 #include <cuda_fp16.h>
-#include "src/fastertransformer/kernels/penalty_types.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/penalty_types.h"
+#include "src/turbomind/utils/cuda_utils.h"
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void invokeApplyRepetitionPenalty(T* logits,
@@ -78,4 +78,4 @@ void invokeMinLengthPenalty(T* logits,
 const int vocab_size_padded,
 cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -24,10 +24,10 @@
 #include "3rdparty/cub/cub.cuh"
 #endif
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/kernels/sampling_topk_kernels.h"
-namespace fastertransformer {
+namespace turbomind {
 __global__ void curandInitialize(curandState_t* state, const int size, const unsigned long long random_seed)
 {
@@ -575,4 +575,4 @@ template void invokeTopKTopPSampling(void* workspace,
 const int* end_ids,
 cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -16,9 +16,9 @@
  */
 #pragma once
-#include "src/fastertransformer/utils/logger.h"
+#include "src/turbomind/utils/logger.h"
 #include <curand_kernel.h>
-namespace fastertransformer {
+namespace turbomind {
 template<typename T>
 void invokeTopKSampling(void* workspace,
@@ -95,4 +95,4 @@ void invokeTopKTopPSampling(void* workspace,
 const int* end_ids,
 cudaStream_t stream);
-} // namespace fastertransformer
+} // namespace turbomind
@@ -22,14 +22,14 @@
 #include "3rdparty/cub/cub.cuh"
 #endif
-#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
-#include "src/fastertransformer/kernels/sampling_topp_kernels.h"
-#include "src/fastertransformer/utils/cuda_utils.h"
+#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
+#include "src/turbomind/kernels/sampling_topp_kernels.h"
+#include "src/turbomind/utils/cuda_utils.h"
 constexpr int ENABLE_SINGLE_PASS_TOP_P = 0;
 constexpr float SINGLE_PASS_THRESHOLD = 0.9;
-namespace fastertransformer {
+namespace turbomind {
 namespace segmented_topp_impl {
@@ -1426,4 +1426,4 @@ void invokeComputeToppDecay(float* runtime_top_p,
 runtime_top_p, runtime_initial_top_p, output_ids, top_p_decay, top_p_min, top_p_reset_ids, local_batch_size);
 }
-} // namespace fastertransformer
+} // namespace turbomind
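
The logger rename mentioned in the commit message follows the same pattern as the hunk above that swaps FT_LOG_DEBUG for TM_LOG_DEBUG. A hedged sketch of the post-rename call pattern (the function name below is hypothetical; the include path, macro, and namespace are taken from the diff):

    // Hypothetical usage sketch, not part of this commit.
    #include "src/turbomind/utils/logger.h"  // was src/fastertransformer/utils/logger.h

    namespace turbomind {

    void exampleEntryPoint()
    {
        // Same call pattern as invokeLogProbFromLogits in the diff above:
        // trace the enclosing function through the renamed TM_LOG_* macros.
        TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    }

    }  // namespace turbomind
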