Commit 981a4610 authored by Li Zhang, committed by GitHub

[Fix] Remove unused code to reduce binary size (#181)

* clean-up

* fix lint

* fix lint
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/turbomind/layers/FfnFP8Layer.h"
#include "src/turbomind/kernels/activation_fp8_kernels.h"
#include "src/turbomind/utils/cublasFP8MMWrapper.h"
#include "src/turbomind/utils/nvtx_utils.h"
namespace turbomind {
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::forward(TensorMap* output_tensors,
TensorMap* input_tensors,
const FfnFP8Weight<T1, T2>* ffn_weights)
{
// input tensors:
// input_hidden_state [token_num, d_model],
// output tensors:
// output_hidden_state [token_num, d_model],
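// Illustrative call sketch (documentation only; the buffer and object names below are
// hypothetical, and the Tensor/TensorMap construction follows the usual turbomind convention
// rather than anything stated in this file):
//   TensorMap inputs({{"input_hidden_state",
//                      Tensor{MEMORY_GPU, TYPE_FP8_E4M3, {token_num, d_model}, d_input_fp8}}});
//   TensorMap outputs({{"output_hidden_state",
//                       Tensor{MEMORY_GPU, TYPE_BF16, {token_num, d_model}, d_output_bf16}}});
//   ffn_fp8_layer->forward(&outputs, &inputs, &ffn_fp8_weights);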
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() == 1);
FT_CHECK(output_tensors->size() == 1);
const int m = input_tensors->at("input_hidden_state").shape[0];
const int d_model = input_tensors->at("input_hidden_state").shape[1];
const T1* input_hidden_state = input_tensors->at("input_hidden_state").getPtr<T1>();
Tensor output_tensor = output_tensors->at("output_hidden_state");
allocateBuffer(m);
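// Two FP8 quantization flavours are dispatched below (inferred from the argument names, not
// from separate documentation): fp8_mode_ == 1 carries per-channel weight scales
// (per_channel_scale_min / scale) through the GEMM and the bias/activation kernel, while
// fp8_mode_ == 2 uses plain per-tensor input/weight scales.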
#ifdef FUSE_GEMM_ACT
if (fp8_mode_ == 1) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(inter_buf_bf16_,
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.per_channel_scale_min, // identity_scale
stream_);
invokeAddBiasActivation(m,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
ffn_weights->intermediate_weight.scale,
ffn_weights->intermediate_weight.per_channel_scale_min,
ffn_weights->output_weight.input_scale_inv);
}
else if (fp8_mode_ == 2) {
#ifdef USE_QGMMA
if (getActivationType() == ActivationType::Gelu) {
PUSH_RANGE("FFN gemm 1 bias gelu");
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Conv1x1Gemm<false, true>(inter_buf_,
m,
inter_size_,
d_model,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.bias,
*(ffn_weights->intermediate_weight.input_h_scale), // scale_a,
*(ffn_weights->intermediate_weight.weight_h_scale), // scale_b,
*(ffn_weights->output_weight.input_h_scale_inv), // scale_d,
stream_);
POP_RANGE;
}
else if (getActivationType() == ActivationType::Relu) {
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Conv1x1Gemm<true, false>(inter_buf_,
m,
inter_size_,
d_model,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.bias,
*(ffn_weights->intermediate_weight.input_h_scale), // scale_a,
*(ffn_weights->intermediate_weight.weight_h_scale), // scale_b,
*(ffn_weights->output_weight.input_h_scale_inv), // scale_d,
stream_);
}
#else // USE_QGMMA
const float alpha = 1.0f;
const float beta = 0.0f;
if (getActivationType() == ActivationType::Gelu) {
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
->Gemm_Bias_Act<false, true>(inter_buf_bf16_,
#else // FP8_GEMM_OUTPUT_QUANT_DISABLE
->Gemm_Bias_Act<false, true>(inter_buf_,
#endif // FP8_GEMM_OUTPUT_QUANT_DISABLE
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.weight_scale,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
stream_);
}
else if (getActivationType() == ActivationType::Relu) {
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
->Gemm_Bias_Act<true, false>(inter_buf_bf16_,
#else // FP8_GEMM_OUTPUT_QUANT_DISABLE
->Gemm_Bias_Act<true, false>(inter_buf_,
#endif // FP8_GEMM_OUTPUT_QUANT_DISABLE
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.weight_scale,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
stream_);
}
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
invokeQuantizeMatrix<T1, T2, QUANTIZE_MODE::PER_TENSOR>(
inter_buf_, ffn_weights->output_weight.input_scale_inv, inter_buf_bf16_, m * inter_size_, 1, stream_);
#endif // FP8_GEMM_OUTPUT_QUANT_DISABLE
#endif // USE_QGMMA
}
#else // FUSE_GEMM_ACT
PUSH_RANGE("FFN gemm 1");
#ifdef SPARSITY_ENABLED
int m_tmp = m;
if (m_tmp % 8 != 0) {
m_tmp = (m_tmp / 8 + 1) * 8;
}
const int m_padded = m_tmp;
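// e.g. m = 13 is padded up to m_padded = 16: the sparse GEMM path expects the row count
// rounded up to a multiple of 8.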
if (sparse_ && cublas_wrapper_->isUseSparse(1, inter_size_, m, d_model)) {
FT_CHECK(false);
// cublas_wrapper_->SpGemm(CUBLAS_OP_N,
// CUBLAS_OP_N,
// inter_size_,
// m_padded,
// d_model,
// ffn_weights->intermediate_weight.sp_kernel,
// input_hidden_state,
// inter_buf_);
}
else {
#endif // SPARSITY_ENABLED
if (fp8_mode_ == 1) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(inter_buf_bf16_,
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.per_channel_scale_min, // identity_scale
stream_);
}
else if (fp8_mode_ == 2) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(inter_buf_bf16_,
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.weight_scale,
stream_);
}
#ifdef SPARSITY_ENABLED
}
#endif // SPARSITY_ENABLED
POP_RANGE;
PUSH_RANGE("FFN add bias act");
if (fp8_mode_ == 1) {
invokeAddBiasActivation(m,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
ffn_weights->intermediate_weight.scale,
ffn_weights->intermediate_weight.per_channel_scale_min,
ffn_weights->output_weight.input_scale_inv);
}
else if (fp8_mode_ == 2) {
invokeAddBiasActivation(m,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
nullptr,
nullptr,
ffn_weights->output_weight.input_scale_inv);
}
sync_check_cuda_error();
POP_RANGE;
#endif // FUSE_GEMM_ACT
PUSH_RANGE("FFN gemm 2");
#ifdef SPARSITY_ENABLED
if (sparse_ && cublas_wrapper_->isUseSparse(1, d_model, m, inter_size_)) {
FT_CHECK(false);
// cublas_wrapper_->SpGemm(CUBLAS_OP_N,
// CUBLAS_OP_N,
// d_model,
// m_padded,
// inter_size_,
// ffn_weights->output_weight.sp_kernel,
// inter_buf_,
// output_tensor);
}
else {
#endif // SPARSITY_ENABLED
if (fp8_mode_ == 1) {
const float alpha = 1.0f;
const float beta = 0.0f;
if (output_tensor.type == TYPE_BF16) {
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(output_tensor.getPtr<T2>(),
(int)1,
(int)m,
(int)d_model,
(int)inter_size_,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
(const __nv_fp8_e4m3*)inter_buf_,
(const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
ffn_weights->output_weight.input_scale,
ffn_weights->identity_scale,
stream_);
}
else if (output_tensor.type == TYPE_FP8_E4M3) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(output_tensor.getPtr<T1>(),
(int)1,
(int)m,
(int)d_model,
(int)inter_size_,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
(const __nv_fp8_e4m3*)inter_buf_,
(const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
ffn_weights->output_weight.input_scale,
ffn_weights->output_weight.per_channel_scale_min,
ffn_weights->output_weight.output_scale_inv,
stream_);
}
else {
FT_CHECK(false);
}
}
else if (fp8_mode_ == 2) {
if (output_tensor.type == TYPE_BF16) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(output_tensor.getPtr<T2>(),
(int)1,
(int)m,
(int)d_model,
(int)inter_size_,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
(const __nv_fp8_e4m3*)inter_buf_,
(const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
ffn_weights->output_weight.input_scale,
ffn_weights->output_weight.weight_scale,
stream_);
}
else if (output_tensor.type == TYPE_FP8_E4M3) {
// It looks like conv1x1Gemm does not bring better performance for this gemm
// because the k dimension of this gemm is large
// #ifdef USE_QGMMA
// reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
// ->Conv1x1Gemm<false, false>(output_tensor.getPtr<T1>(),
// m,
// d_model,
// inter_size_,
// inter_buf_,
// ffn_weights->output_weight.kernel,
// ffn_weights->output_weight.bias,
// *(ffn_weights->output_weight.input_h_scale),       // scale_a,
// *(ffn_weights->output_weight.weight_h_scale),      // scale_b,
// *(ffn_weights->output_weight.output_h_scale_inv),  // scale_d,
// stream_);
// #else // USE_QGMMA
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(output_tensor.getPtr<T1>(),
(int)1,
(int)m,
(int)d_model,
(int)inter_size_,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
(const __nv_fp8_e4m3*)inter_buf_,
(const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
ffn_weights->output_weight.input_scale,
ffn_weights->output_weight.weight_scale,
ffn_weights->output_weight.output_scale_inv,
stream_);
// #endif // USE_QGMMA
}
else {
FT_CHECK(false);
}
}
#ifdef SPARSITY_ENABLED
}
#endif // SPARSITY_ENABLED
POP_RANGE;
sync_check_cuda_error();
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
sync_check_cuda_error();
}
template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::FfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse),
inter_size_(inter_size),
fp8_mode_(fp8_mode)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer):
BaseLayer(ffn_layer.stream_,
ffn_layer.cublas_wrapper_,
ffn_layer.allocator_,
ffn_layer.is_free_buffer_after_forward_,
ffn_layer.cuda_device_prop_,
ffn_layer.sparse_),
inter_size_(ffn_layer.inter_size_),
fp8_mode_(ffn_layer.fp8_mode_)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::~FfnFP8Layer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
cublas_wrapper_ = nullptr;
freeBuffer();
}
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::allocateBuffer(size_t token_num)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
inter_buf_ = (T1*)allocator_->reMalloc(inter_buf_, sizeof(T1) * token_num * inter_size_, false);
inter_buf_bf16_ = (T2*)allocator_->reMalloc(inter_buf_bf16_, sizeof(T2) * token_num * inter_size_, false);
is_allocate_buffer_ = true;
}
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::freeBuffer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&inter_buf_));
allocator_->free((void**)(&inter_buf_bf16_));
is_allocate_buffer_ = false;
}
}
template class FfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;
template<typename T1, typename T2>
GeluFfnFP8Layer<T1, T2>::GeluFfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
FfnFP8Layer<T1, T2>(inter_size, fp8_mode, stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse)
{
}
template<typename T1, typename T2>
GeluFfnFP8Layer<T1, T2>::GeluFfnFP8Layer(GeluFfnFP8Layer<T1, T2> const& gelu_ffn_layer):
FfnFP8Layer<T1, T2>(gelu_ffn_layer)
{
}
template<typename T1, typename T2>
void GeluFfnFP8Layer<T1, T2>::invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale)
{
FP8ActivationParam<T1, T2> param{inter_buf_bf16_,
inter_buf_,
bias,
input_scale,
input_scale_2,
input_scale_2_min,
output_scale,
(uint32_t)m,
(uint32_t)inter_size_,
stream_};
invokeFP8AddBiasGelu<T1, T2>(param);
}
template class GeluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;
template<typename T1, typename T2>
ReluFfnFP8Layer<T1, T2>::ReluFfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
FfnFP8Layer<T1, T2>(inter_size, fp8_mode, stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse)
{
}
template<typename T1, typename T2>
ReluFfnFP8Layer<T1, T2>::ReluFfnFP8Layer(ReluFfnFP8Layer<T1, T2> const& relu_ffn_layer):
FfnFP8Layer<T1, T2>(relu_ffn_layer)
{
}
template<typename T1, typename T2>
void ReluFfnFP8Layer<T1, T2>::invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale)
{
FP8ActivationParam<T1, T2> param{inter_buf_bf16_,
inter_buf_,
bias,
input_scale,
input_scale_2,
input_scale_2_min,
output_scale,
(uint32_t)m,
(uint32_t)inter_size_,
stream_};
invokeFP8AddBiasRelu<T1, T2>(param);
}
template class ReluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;
} // namespace turbomind
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/FfnFP8Weight.h"
#include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/utils/memory_utils.h"
#include <vector>
namespace turbomind {
template<typename T1, typename T2>
class FfnFP8Layer: public BaseLayer {
private:
void allocateBuffer() override;
void freeBuffer() override;
void allocateBuffer(size_t token_num);
protected:
const int fp8_mode_;
T1* inter_buf_ = nullptr;
T2* inter_buf_bf16_ = nullptr;
size_t inter_size_;
virtual void invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale) = 0;
public:
FfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer);
virtual ~FfnFP8Layer();
virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnFP8Weight<T1, T2>* ffn_weights);
virtual ActivationType getActivationType() = 0;
};
template<typename T1, typename T2>
class GeluFfnFP8Layer: public FfnFP8Layer<T1, T2> {
public:
GeluFfnFP8Layer(size_t inter_size,
int fp8_mode_,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
GeluFfnFP8Layer(GeluFfnFP8Layer<T1, T2> const& ffn_layer);
virtual ~GeluFfnFP8Layer() = default;
ActivationType getActivationType() override
{
return ActivationType::Gelu;
};
protected:
using FfnFP8Layer<T1, T2>::stream_;
private:
using FfnFP8Layer<T1, T2>::inter_buf_;
using FfnFP8Layer<T1, T2>::inter_size_;
using FfnFP8Layer<T1, T2>::fp8_mode_;
using FfnFP8Layer<T1, T2>::inter_buf_bf16_;
void invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale) override;
};
template<typename T1, typename T2>
class ReluFfnFP8Layer: public FfnFP8Layer<T1, T2> {
public:
ReluFfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
ReluFfnFP8Layer(ReluFfnFP8Layer<T1, T2> const& ffn_layer);
virtual ~ReluFfnFP8Layer() = default;
ActivationType getActivationType() override
{
return ActivationType::Relu;
};
protected:
using FfnFP8Layer<T1, T2>::stream_;
private:
using FfnFP8Layer<T1, T2>::inter_buf_;
using FfnFP8Layer<T1, T2>::inter_size_;
using FfnFP8Layer<T1, T2>::fp8_mode_;
using FfnFP8Layer<T1, T2>::inter_buf_bf16_;
void invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale) override;
};
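// Usage sketch (illustrative only; the layer, stream, wrapper, allocator and weight objects
// below are hypothetical names, not declared in this header):
//   GeluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16> ffn(
//       inter_size, /*fp8_mode=*/2, stream, cublas_wrapper, allocator,
//       /*is_free_buffer_after_forward=*/false);
//   ffn.forward(&output_tensors, &input_tensors, &ffn_weights);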
} // namespace turbomind
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {
template<typename T1, typename T2>
struct FfnFP8Weight: FfnWeight<T1, T2> {
ScaleList* scale_list_ptr;
float* identity_scale;
float* identity_h_scale;
};
} // namespace turbomind
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {
template<typename T>
struct FfnINT8Weight: FfnWeight<T> {
ScaleList* scale_list_ptr;
};
} // namespace turbomind
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "FfnLayerINT8.h"
#include "src/turbomind/utils/nvtx_utils.h"
namespace turbomind {
template<typename T>
void FfnLayerINT8<T>::forward(std::vector<turbomind::Tensor>* output_tensors,
const std::vector<turbomind::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights)
{
// input_tensors: [input (token_num, hidden_dimension)]
// output_tensors: [output (token_num, hidden_dimension)]
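// Illustrative call sketch (hypothetical buffer names; assumes int8 activations and an int32
// output buffer, matching the getPtr<> casts below, with the usual COL32 ordering):
//   std::vector<turbomind::Tensor> in  = {Tensor{MEMORY_GPU, TYPE_INT8, {token_num, hidden_units}, d_in_int8}};
//   std::vector<turbomind::Tensor> out = {Tensor{MEMORY_GPU, TYPE_INT32, {token_num, hidden_units}, d_out_int32}};
//   ffn_int8_layer->forward(&out, &in, &ffn_int8_weights);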
ScaleList* scale_list = ((const FfnINT8Weight<T>*)ffn_weights)->scale_list_ptr;
cublasINT8MMWrapper* cublas_wrapper = (cublasINT8MMWrapper*)cublas_wrapper_;
FT_CHECK(isValidTokenNum(input_tensors->at(0).shape[0]));
allocateBuffer();
const int m = static_cast<int>(input_tensors->at(0).shape[0]);
#ifdef SPARSITY_ENABLED
int m_tmp = m;
if (m_tmp % 16 != 0) {
m_tmp = (m_tmp / 16 + 1) * 16;
}
const int m_padded = m_tmp;
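// e.g. m = 20 is padded up to m_padded = 32; the INT8 sparse path rounds the row count up to a
// multiple of 16.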
#endif
int32_t* output_tensor = output_tensors->at(0).getPtr<int32_t>();
const int8_t* input_tensor = input_tensors->at(0).getPtr<const int8_t>();
PUSH_RANGE("FFN gemm 1");
if (int8_mode_ == 1) {
cublas_wrapper->Gemm(inter_int_buf_,
1,
m,
inter_size_,
hidden_units_,
0,
0,
0,
input_tensor,
(int8_t*)(ffn_weights->intermediate_weight.kernel));
}
else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
if (sparse_) {
cublas_wrapper->SpGemm(inter_size_,
m_padded,
hidden_units_,
scale_list->h_scale_list_[scale_list->p3_offset_ + 6],
(int8_t*)(ffn_weights->intermediate_weight.sp_kernel),
input_tensor,
(int8_t*)inter_int_buf_);
}
else {
#endif
cublas_wrapper->Gemm((int8_t*)inter_int_buf_,
1,
m,
inter_size_,
hidden_units_,
0,
0,
0,
scale_list->h_scale_list_[scale_list->p3_offset_ + 6],
input_tensor,
(int8_t*)(ffn_weights->intermediate_weight.kernel));
#ifdef SPARSITY_ENABLED
}
#endif
}
POP_RANGE;
PUSH_RANGE("add bias act");
invokeAddBiasActivation(m, ffn_weights->intermediate_weight.bias, scale_list);
POP_RANGE;
sync_check_cuda_error();
PUSH_RANGE("FFN gemm 2");
if (int8_mode_ == 1) {
cublas_wrapper->Gemm(output_tensor,
1,
m,
hidden_units_,
inter_size_,
0,
0,
0,
inter_buf_,
(int8_t*)(ffn_weights->output_weight.kernel));
}
else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
if (sparse_) {
cublas_wrapper->SpGemm(hidden_units_,
m_padded,
inter_size_,
scale_list->h_scale_list_[scale_list->p3_offset_ + 7],
(int8_t*)(ffn_weights->output_weight.sp_kernel),
inter_buf_,
(int8_t*)output_tensor);
}
else {
#endif
cublas_wrapper->Gemm((int8_t*)output_tensor,
1,
m,
hidden_units_,
inter_size_,
0,
0,
0,
scale_list->h_scale_list_[scale_list->p3_offset_ + 7],
inter_buf_,
(int8_t*)(ffn_weights->output_weight.kernel));
#ifdef SPARSITY_ENABLED
}
#endif
}
POP_RANGE;
sync_check_cuda_error();
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
sync_check_cuda_error();
}
template<typename T>
FfnLayerINT8<T>::FfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
max_token_num_(max_batch_size * max_seq_len),
head_num_(head_num),
size_per_head_(size_per_head),
hidden_units_(head_num * size_per_head),
inter_size_(inter_size),
int8_mode_(int8_mode),
sparse_(sparse)
{
}
template<typename T>
FfnLayerINT8<T>::FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer):
BaseLayer(
ffn_layer.stream_, ffn_layer.cublas_wrapper_, ffn_layer.allocator_, ffn_layer.is_free_buffer_after_forward_),
max_token_num_(ffn_layer.max_token_num_),
head_num_(ffn_layer.head_num_),
size_per_head_(ffn_layer.size_per_head_),
hidden_units_(ffn_layer.hidden_units_),
inter_size_(ffn_layer.inter_size_),
int8_mode_(ffn_layer.int8_mode_),
sparse_(ffn_layer.sparse_)
{
}
template<typename T>
FfnLayerINT8<T>::~FfnLayerINT8()
{
cublas_wrapper_ = nullptr;
freeBuffer();
}
template<typename T>
void FfnLayerINT8<T>::allocateBuffer()
{
if (is_allocate_buffer_ == false) {
inter_int_buf_ =
(int32_t*)allocator_->reMalloc(inter_int_buf_, sizeof(int32_t) * max_token_num_ * inter_size_, false);
inter_buf_ = (int8_t*)allocator_->reMalloc(inter_buf_, sizeof(int8_t) * max_token_num_ * inter_size_, false);
is_allocate_buffer_ = true;
}
}
template<typename T>
void FfnLayerINT8<T>::freeBuffer()
{
if (is_allocate_buffer_ == true) {
allocator_->free((void**)(&inter_int_buf_));
allocator_->free((void**)(&inter_buf_));
is_allocate_buffer_ = false;
}
}
template<typename T>
bool FfnLayerINT8<T>::isValidTokenNum(size_t token_num)
{
if (max_token_num_ == 0) {
max_token_num_ = token_num;
return true;
}
else {
return token_num <= max_token_num_;
}
}
template class FfnLayerINT8<float>;
template class FfnLayerINT8<half>;
template<typename T>
GeluFfnLayerINT8<T>::GeluFfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
FfnLayerINT8<T>(max_batch_size,
max_seq_len,
head_num,
size_per_head,
inter_size,
int8_mode,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward,
sparse)
{
}
template<typename T>
GeluFfnLayerINT8<T>::GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& gelu_ffn_layer): FfnLayerINT8<T>(gelu_ffn_layer)
{
}
template<typename T>
void GeluFfnLayerINT8<T>::invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list)
{
if (int8_mode_ == 1) {
invokeAddBiasGeluCol32<T>(inter_buf_,
inter_int_buf_,
bias,
m,
inter_size_,
stream_,
&(scale_list->d_scale_list_[scale_list->p2_offset_ + 4 * hidden_units_]),
&(scale_list->d_scale_list_[44 + 2]),
&(scale_list->d_scale_list_[52 + 3]));
}
else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
if (sparse_) {
invokeAddBiasGeluRow<T>(inter_buf_,
(const int8_t*)inter_int_buf_,
bias,
m,
inter_size_,
stream_,
&(scale_list->d_scale_list_[48 + 1]),
&(scale_list->d_scale_list_[52 + 3]));
}
else {
#endif
invokeAddBiasGeluCol32<T>(inter_buf_,
(const int8_t*)inter_int_buf_,
bias,
m,
inter_size_,
stream_,
&(scale_list->d_scale_list_[48 + 1]),
&(scale_list->d_scale_list_[52 + 3]));
#ifdef SPARSITY_ENABLED
}
#endif
}
}
template class GeluFfnLayerINT8<float>;
template class GeluFfnLayerINT8<half>;
template<typename T>
ReluFfnLayerINT8<T>::ReluFfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
FfnLayerINT8<T>(max_batch_size,
max_seq_len,
head_num,
size_per_head,
inter_size,
int8_mode,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward)
{
}
template<typename T>
ReluFfnLayerINT8<T>::ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& relu_ffn_layer): FfnLayerINT8<T>(relu_ffn_layer)
{
}
template<typename T>
void ReluFfnLayerINT8<T>::invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list)
{
// TODO
}
template class ReluFfnLayerINT8<float>;
template class ReluFfnLayerINT8<half>;
} // namespace turbomind
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnINT8Weight.h"
#include "src/turbomind/kernels/activation_int8_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/utils/ScaleList.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasINT8MMWrapper.h"
#include "src/turbomind/utils/memory_utils.h"
#include <vector>
namespace turbomind {
template<typename T>
class GeluFfnLayerINT8;
template<typename T>
class ReluFfnLayerINT8;
template<typename T>
class FfnLayerINT8: public BaseLayer {
private:
// buffer handling
size_t max_token_num_ = 0;
// meta data
size_t head_num_;
size_t size_per_head_;
// calculated data
size_t hidden_units_;
void allocateBuffer() override;
void freeBuffer() override;
bool isValidTokenNum(size_t token_num);
protected:
size_t inter_size_;
int int8_mode_;
bool sparse_;
int* inter_int_buf_;
int8_t* inter_buf_;
virtual void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) = 0;
public:
FfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer);
~FfnLayerINT8();
void forward(std::vector<turbomind::Tensor>* output_tensors,
const std::vector<turbomind::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights);
friend GeluFfnLayerINT8<T>;
friend ReluFfnLayerINT8<T>;
};
template<typename T>
class GeluFfnLayerINT8: public FfnLayerINT8<T> {
public:
GeluFfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& ffn_layer);
~GeluFfnLayerINT8() = default;
private:
using FfnLayerINT8<T>::inter_int_buf_;
using FfnLayerINT8<T>::inter_buf_;
using FfnLayerINT8<T>::inter_size_;
using FfnLayerINT8<T>::stream_;
using FfnLayerINT8<T>::int8_mode_;
using FfnLayerINT8<T>::sparse_;
using FfnLayerINT8<T>::hidden_units_;
void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};
template<typename T>
class ReluFfnLayerINT8: public FfnLayerINT8<T> {
public:
ReluFfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& ffn_layer);
~ReluFfnLayerINT8() = default;
private:
using FfnLayerINT8<T>::inter_int_buf_;
using FfnLayerINT8<T>::inter_buf_;
using FfnLayerINT8<T>::inter_size_;
using FfnLayerINT8<T>::stream_;
using FfnLayerINT8<T>::int8_mode_;
using FfnLayerINT8<T>::hidden_units_;
void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};
} // namespace turbomind
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {
template<typename T1, typename T2>
struct AttentionFP8Weight: public AttentionWeight<T1, T2> {
const float* qk_scale;
const float* qk_scale_inv;
float* qk_h_scale;
float* qk_h_scale_inv;
float* identity_scale;
float* identity_h_scale;
};
} // namespace turbomind
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <assert.h>
#include <vector>
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/attention_layers/BaseAttentionLayer.h"
#include "src/turbomind/layers/attention_layers_fp8/AttentionFP8Weight.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasFP8MMWrapper.h"
#include "src/turbomind/utils/memory_utils.h"
namespace turbomind {
// template<typename T>
// AttentionType getAttentionType(size_t size_per_head, const int sm, const bool remove_padding, const int max_seq_len,
// const bool is_fuse = true)
// {
// if (std::is_same<T, half>::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm ==
// kSM_72)
// && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) {
// return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA;
// }
// else {
// return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
// }
// }
template<typename T1, typename T2>
class BaseAttentionFP8Layer: public BaseLayer {
public:
virtual void forward(TensorMap* output_tensors,
TensorMap* input_tensors,
const AttentionFP8Weight<T1, T2>* attention_weights) = 0;
BaseAttentionFP8Layer(cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse)
{
}
virtual ~BaseAttentionFP8Layer() = default;
};
} // namespace turbomind
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {
template<typename T>
struct AttentionINT8Weight: AttentionWeight<T> {
ScaleList* scale_list_ptr;
};
} // namespace turbomind
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {
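// Copies the beam indirection cache entries of the source beam chosen at this step into the
// target cache. Time steps are stored circularly (time_step % max_seq_len), and the entry for
// the current step records the chosen beam id itself.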
__global__ void update_indir_cache_kernel(int* tgt_indir_cache,
const int* src_indir_cache,
const int* beam_ids,
const bool* finished,
int start_step,
int batch_dim,
int local_batch_size,
int beam_width,
int max_seq_len,
int step)
{
int time_step = threadIdx.x + blockIdx.x * blockDim.x;
int bb_id = threadIdx.y + blockIdx.y * blockDim.y;
const int batch_id = bb_id / beam_width;
const int beam_id = bb_id % beam_width;
if (bb_id >= beam_width * local_batch_size || time_step >= min(step + 1, max_seq_len) || finished[bb_id]) {
return;
}
time_step += start_step;
const int time_step_circ = time_step % max_seq_len;
const int src_beam = beam_ids[batch_id * beam_width + beam_id];
const uint tgt_offset = batch_id * beam_width * max_seq_len + beam_id * max_seq_len + time_step_circ;
const uint src_offset = batch_id * beam_width * max_seq_len + src_beam * max_seq_len + time_step_circ;
tgt_indir_cache[tgt_offset] = (time_step == step) ? beam_id : src_indir_cache[src_offset];
}
void update_indir_cache_kernelLauncher(int* tgt_indir_cache,
const int* src_indir_cache,
const int* beam_ids,
const bool* finished,
int batch_dim,
int local_batch_size,
int beam_width,
int max_seq_len,
int step,
cudaStream_t stream)
{
const dim3 block(32);
const int start_step = max(0, step + 1 - max_seq_len);
const int num_steps = min(step + 1, max_seq_len);
// Update indirections steps [start_step, step], included
const dim3 grid((num_steps + block.x - 1) / block.x, local_batch_size * beam_width);
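// Worked example: step = 10, max_seq_len = 8 -> start_step = 3, num_steps = 8,
// grid = ((8 + 31) / 32, local_batch_size * beam_width) = (1, local_batch_size * beam_width).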
update_indir_cache_kernel<<<grid, block, 0, stream>>>(tgt_indir_cache,
src_indir_cache,
beam_ids,
finished,
start_step,
batch_dim,
local_batch_size,
beam_width,
max_seq_len,
step);
}
template<typename T>
BaseBeamSearchLayer<T>::BaseBeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
DynamicDecodeBaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr),
vocab_size_(vocab_size),
vocab_size_padded_(vocab_size_padded)
{
}
template<typename T>
BaseBeamSearchLayer<T>::BaseBeamSearchLayer(BaseBeamSearchLayer<T> const& beam_search_layer):
DynamicDecodeBaseLayer(beam_search_layer),
vocab_size_(beam_search_layer.vocab_size_),
vocab_size_padded_(beam_search_layer.vocab_size_padded_),
topk_softmax_workspace_size_(beam_search_layer.topk_softmax_workspace_size_)
{
}
template<typename T>
BaseBeamSearchLayer<T>::~BaseBeamSearchLayer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
freeBuffer();
}
template<typename T>
void BaseBeamSearchLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
allocator_->free((void**)(&topk_softmax_workspace_));
is_allocate_buffer_ = false;
}
}
template<typename T>
void BaseBeamSearchLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
{
// do nothing.
}
template<typename T>
void BaseBeamSearchLayer<T>::forward(std::vector<Tensor>* output_tensors, const std::vector<Tensor>* input_tensors)
{
// input_tensors:
// logits [local_batch_size, beam_width, vocab_size_padded]
// embedding_bias [vocab_size_padded]
// step [1] on cpu
// src_cache_indirection [local_batch_size, beam_width, max_seq_len]
// max_input_length [1] on cpu
// input_lengths [local_batch_size * beam_width]
// ite [1] on cpu
// output_tensors:
// output_ids [max_seq_len, batch_size, beam_width]
// finished [local_batch_size * beam_width]
// cum_log_probs [local_batch_size * beam_width]
// parent_ids [max_seq_len, batch_size * beam_width]
// sequence_length [local_batch_size * beam_width]
// tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
std::unordered_map<std::string, Tensor> input_tensors_map{{"logits", input_tensors->at(0)},
{"embedding_bias", input_tensors->at(1)},
{"step", input_tensors->at(2)},
{"src_cache_indirection", input_tensors->at(4)},
{"max_input_length", input_tensors->at(5)},
{"input_lengths", input_tensors->at(6)},
{"ite", input_tensors->at(7)}};
std::unordered_map<std::string, Tensor> output_tensors_map{{"output_ids", output_tensors->at(0)},
{"finished", output_tensors->at(1)},
{"cum_log_probs", output_tensors->at(2)},
{"parent_ids", output_tensors->at(3)},
{"sequence_length", output_tensors->at(4)},
{"tgt_cache_indirection", output_tensors->at(5)}};
forward(&output_tensors_map, &input_tensors_map);
}
template<typename T>
void BaseBeamSearchLayer<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors)
{
TensorMap input_map(*input_tensors);
TensorMap output_map(*output_tensors);
forward(&output_map, &input_map);
}
template<typename T>
void BaseBeamSearchLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors)
{
// input_tensors:
// logits [local_batch_size, beam_width, vocab_size_padded]
// embedding_bias [vocab_size_padded]
// step [1] on cpu
// src_cache_indirection [local_batch_size, beam_width, max_seq_len]
// end_id [local_batch_size]
// max_input_length [1] on cpu
// input_lengths [local_batch_size * beam_width], optional
// ite [1] on cpu
// beam_search_diversity_rate [1] on cpu, optional
// temperature [1] on cpu, optional
// len_penalty [1] on cpu, optional
// repetition_penalty [1] on cpu, optional
// presence_penalty [1] on cpu, optional
// Only one of repetition and presence penalties is allowed.
// min_length [1] on cpu, int, optional
// output_tensors:
// output_ids [max_seq_len, batch_size, beam_width]
// finished [local_batch_size * beam_width], optional
// cum_log_probs [local_batch_size * beam_width]
// parent_ids [max_seq_len, batch_size * beam_width]
// sequence_length [local_batch_size * beam_width], optional
// tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
// output_log_probs [max_seq_len, batch_size, beam_width], optional
// beam_hyps, optional
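// Rough summary of what the code below actually touches (not an official contract): "logits",
// "step", "ite" and "max_input_length" are read directly here; "end_id", "finished" and
// "cum_log_probs" are required by invokeSoftMax; the cache-indirection tensors are only used
// when beam_width > 1.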
FT_CHECK(input_tensors->size() >= 7);
FT_CHECK(output_tensors->size() >= 5);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int beam_width = output_tensors->at("output_ids").shape[2];
allocateBuffer(batch_size, beam_width);
const int step = input_tensors->at("step").getVal<int>();
const int ite = input_tensors->at("ite").getVal<int>();
const int local_batch_size = input_tensors->at("logits").shape[0];
const float temperature = input_tensors->getVal<float>("temperature", 1.0f);
const T* embedding_bias = input_tensors->getPtr<const T>("embedding_bias", nullptr);
RepetitionPenaltyType repetition_penalty_type = RepetitionPenaltyType::None;
float repetition_penalty = getDefaultPenaltyValue(repetition_penalty_type);
if (input_tensors->isExist("repetition_penalty") || input_tensors->isExist("presence_penalty")) {
FT_CHECK_WITH_INFO(
!(input_tensors->isExist("repetition_penalty") && input_tensors->isExist("presence_penalty")),
"Found ambiguous parameters repetition_penalty and presence_penalty which are mutually exclusive. "
"Please provide one of repetition_penalty or presence_penalty.");
repetition_penalty_type = input_tensors->isExist("repetition_penalty") ? RepetitionPenaltyType::Multiplicative :
RepetitionPenaltyType::Additive;
repetition_penalty = repetition_penalty_type == RepetitionPenaltyType::Multiplicative ?
input_tensors->getVal<float>("repetition_penalty") :
input_tensors->getVal<float>("presence_penalty");
}
invokeAddBiasApplyPenalties(
step,
input_tensors->at("logits").getPtr<T>(),
output_tensors->at("output_ids")
.getPtrWithOffset<const int>((step - 1) * batch_size * beam_width + ite * local_batch_size * beam_width),
output_tensors->getPtr<const int>("output_ids"),
output_tensors->getPtr<const int>("parent_ids"),
input_tensors->getPtr<const int>("input_lengths", nullptr),
output_tensors->getPtr<const int>("sequence_length", nullptr),
embedding_bias,
ite,
input_tensors->getVal<int>("max_input_length"),
local_batch_size,
batch_size,
beam_width,
vocab_size_,
vocab_size_padded_,
input_tensors->getPtr<const int>("end_id", nullptr),
temperature,
repetition_penalty,
repetition_penalty_type,
input_tensors->getVal<const int>("min_length", 0),
stream_);
sync_check_cuda_error();
invokeSoftMax(output_tensors, input_tensors);
if (beam_width > 1) {
const int max_seq_len = output_tensors->at("output_ids").shape[0];
update_indir_cache_kernelLauncher(
output_tensors->at("tgt_cache_indirection").getPtr<int>(),
input_tensors->at("src_cache_indirection").getPtr<const int>(),
output_tensors->at("parent_ids")
.getPtrWithOffset<const int>(+step * beam_width * batch_size + ite * local_batch_size * beam_width),
output_tensors->at("finished").getPtr<const bool>(),
batch_size,
local_batch_size,
beam_width,
max_seq_len,
step,
stream_);
sync_check_cuda_error();
}
sync_check_cuda_error();
if (is_free_buffer_after_forward_) {
freeBuffer();
}
sync_check_cuda_error();
}
template class BaseBeamSearchLayer<float>;
template class BaseBeamSearchLayer<half>;
} // namespace turbomind
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/layers/DynamicDecodeBaseLayer.h"
namespace turbomind {
template<typename T>
class BaseBeamSearchLayer: public DynamicDecodeBaseLayer {
private:
void freeBuffer();
protected:
// meta data
size_t vocab_size_;
size_t vocab_size_padded_;
size_t topk_softmax_workspace_size_;
void* topk_softmax_workspace_ = nullptr;
virtual void allocateBuffer() = 0;
virtual void allocateBuffer(size_t batch_size, size_t beam_width) = 0;
virtual void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) = 0;
public:
BaseBeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
BaseBeamSearchLayer(BaseBeamSearchLayer<T> const& beam_search_layer);
~BaseBeamSearchLayer();
void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
void forward(std::vector<turbomind::Tensor>* output_tensors,
const std::vector<turbomind::Tensor>* input_tensors) override;
void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors) override;
void forward(TensorMap* output_tensors, TensorMap* input_tensors) override;
};
void update_indir_cache_kernelLauncher(int* tgt_indir_cache,
const int* src_indir_cache,
const int* beam_ids,
const bool* finished,
int batch_dim,
int local_batch_size,
int beam_width,
int max_seq_len,
int step,
cudaStream_t stream);
} // namespace turbomind
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/layers/beam_search_layers/BeamSearchLayer.h"
namespace turbomind {
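// Per row (one beam of one batch entry) this kernel produces cumulative log-probabilities:
// a numerically stable log-softmax over the n vocabulary logits (with a small epsilon on the
// sum) plus the beam's running cum_log_prob, i.e.
//   log_probs[v] = logits[v] - max_v - log(sum_v' exp(logits[v'] - max_v)) + cum_log_probs[bid].
// Finished beams instead keep their score only at end_id and get -FLT_MAX everywhere else.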
template<typename T>
__global__ void logProbAddCumLogProb(float* log_probs,
const T* logits,
const float* cum_log_probs,
const int* end_ids,
const bool* finished,
const int beam_width,
const int n)
{
int bid = blockIdx.x;
bool finish = finished != nullptr ? finished[bid] : false;
int offset = bid * n;
float max_val = -1 * FLT_MAX;
__shared__ float s_max_val;
__shared__ float s_sum_val;
if (finish) {
for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
log_probs[offset + tid] = (tid == end_ids[bid / beam_width]) ? cum_log_probs[bid] : -FLT_MAX;
}
}
else {
for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
log_probs[offset + tid] = (float)(logits[offset + tid]);
max_val = max(max_val, log_probs[offset + tid]);
}
max_val = blockReduceMax(max_val);
if (threadIdx.x == 0) {
s_max_val = max_val;
}
__syncthreads();
float sum_val = 0.0f;
for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
log_probs[offset + tid] = __expf(log_probs[offset + tid] - s_max_val);
sum_val += log_probs[offset + tid];
}
sum_val = blockReduceSum(sum_val);
if (threadIdx.x == 0) {
s_sum_val = sum_val + 1e-6f;
}
__syncthreads();
for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
log_probs[offset + tid] = logf(log_probs[offset + tid] / s_sum_val) + cum_log_probs[bid];
}
}
}
template<typename T>
void invokeLogProbAddCumLogProb(float* log_probs,
const T* logits,
const float* cum_log_probs,
const int* end_ids,
const bool* finished,
const int m,
const int beam_width,
const int n,
cudaStream_t stream)
{
dim3 grid(m);
dim3 block(min(n, 1024));
/* n is the vocab_size, e.g., 30000 or 7000; the vocab size is usually very large. */
logProbAddCumLogProb<<<grid, block, 0, stream>>>(
log_probs, logits, cum_log_probs, end_ids, finished, beam_width, n);
}
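// The word_ids consumed below are flat candidate indices over beam_width * vocab_size per
// batch entry (as produced by the preceding top-k), so the kernel recovers
// beam_id = (word_ids / vocab_size) % beam_width and word_id = word_ids % vocab_size before
// updating the per-beam bookkeeping.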
template<typename T>
__global__ void updateStatesKernel(T* log_probs,
T* cum_log_probs,
float* output_log_probs,
bool* finished,
int* parent_ids,
int* sequence_length,
int* word_ids,
int* output_ids,
BeamHypotheses beam_hyps,
const int local_batch_size,
const int beam_width,
const int vocab_size,
const int* end_ids)
{
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < local_batch_size * beam_width;
index += blockDim.x * gridDim.x) {
int batch_id = index / beam_width;
sequence_length[index] = finished[index] ? sequence_length[index] : sequence_length[index] + 1;
int beam_id = (word_ids[index] / vocab_size) % beam_width;
int word_id = word_ids[index] % vocab_size;
if (output_log_probs != nullptr) {
// get the cum_log_probs of previous run
output_log_probs[index] = log_probs[batch_id * beam_width * vocab_size + beam_id * vocab_size + word_id]
- cum_log_probs[batch_id * beam_width + beam_id];
}
cum_log_probs[index] = log_probs[batch_id * beam_width * vocab_size + beam_id * vocab_size + word_id];
sequence_length[index] = sequence_length[batch_id * beam_width + beam_id];
finished[index] = word_id == end_ids[batch_id] ? 1 : 0;
parent_ids[index] = beam_id;
word_ids[index] = word_id;
output_ids[index] = word_id;
if (beam_hyps.num_beams != nullptr) {
if (beam_hyps.num_beams[beam_hyps.ite * beam_hyps.local_batch_size + batch_id] == beam_width) {
for (int i = 0; i < beam_width; i++) {
finished[batch_id * beam_width + i] = true;
}
}
}
}
}
void invokeUpdateStates(float* log_probs,
float* cum_log_probs,
float* output_log_probs,
bool* finished,
int* parent_ids,
int* sequence_length,
int* word_ids,
int* output_ids,
BeamHypotheses* beam_hyps,
const int local_batch_size,
const int beam_width,
const int vocab_size,
const int* end_ids,
cudaStream_t stream)
{
dim3 grid((int)ceil(local_batch_size * beam_width * 1.0 / 256));
dim3 block(256);
updateStatesKernel<float><<<grid, block, 0, stream>>>(log_probs,
cum_log_probs,
output_log_probs,
finished,
parent_ids,
sequence_length,
word_ids,
output_ids,
*beam_hyps,
local_batch_size,
beam_width,
vocab_size,
end_ids);
}
template<typename T>
void BeamSearchLayer<T>::invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors)
{
// input_tensors:
// logits [local_batch_size, beam_width, vocab_size_padded]
// embedding_bias [vocab_size_padded]
// step [1] on cpu
// src_cache_indirection [local_batch_size, beam_width, max_seq_len]
// max_input_length [1] on cpu
// input_lengths [local_batch_size * beam_width]
// ite [1] on cpu
// beam_search_diversity_rate [1] on cpu, optional
// temperature [1] on cpu, optional
// len_penalty [1] on cpu, optional
// repetition_penalty [1] on cpu, optional
// output_tensors:
// output_ids [max_seq_len, batch_size, beam_width]
// finished [local_batch_size * beam_width]
// cum_log_probs [local_batch_size * beam_width]
// parent_ids [max_seq_len, batch_size * beam_width]
// sequence_length [local_batch_size * beam_width]
// tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
// output_log_probs [max_seq_len, batch_size * beam_width], optional
// beam_hyps, optional
FT_CHECK(input_tensors->size() >= 7);
FT_CHECK(output_tensors->size() >= 6);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int beam_width = output_tensors->at("output_ids").shape[2];
const int step = input_tensors->at("step").getVal<int>();
const int ite = input_tensors->at("ite").getVal<int>();
const int local_batch_size = input_tensors->at("logits").shape[0];
const float diversity_rate = input_tensors->isExist("beam_search_diversity_rate") ?
input_tensors->at("beam_search_diversity_rate").getVal<float>() :
0.0f;
const float length_penalty =
input_tensors->isExist("len_penalty") ? input_tensors->at("len_penalty").getVal<float>() : 0.0f;
const int id_offset = step * batch_size * beam_width + ite * local_batch_size * beam_width;
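// e.g. step = 3, batch_size = 4, beam_width = 2, ite = 0 -> id_offset = 24, i.e. the start of
// the step-3 row in the [max_seq_len, batch_size, beam_width] output_ids layout.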
invokeLogProbAddCumLogProb(float_log_prob_buf_,
input_tensors->at("logits").getPtr<T>(),
output_tensors->at("cum_log_probs").getPtr<float>(),
input_tensors->at("end_id").getPtr<const int>(),
output_tensors->at("finished").getPtr<bool>(),
local_batch_size * beam_width,
beam_width,
vocab_size_padded_,
stream_);
sync_check_cuda_error();
BeamHypotheses beam_hyps;
if (output_tensors->isExist("beam_hyps") && diversity_rate == 0.0f) {
beam_hyps = *((BeamHypotheses*)(output_tensors->at("beam_hyps").getPtr<void>()));
beam_hyps.step = step;
beam_hyps.ite = ite;
beam_hyps.local_batch_size = local_batch_size;
beam_hyps.batch_size = output_tensors->at("output_ids").shape[1];
beam_hyps.max_seq_len = output_tensors->at("output_ids").shape[0];
beam_hyps.output_ids_src = output_tensors->at("output_ids").getPtr<int>();
beam_hyps.parent_ids_src = output_tensors->at("parent_ids").getPtr<int>();
beam_hyps.sequence_lengths_src = output_tensors->at("sequence_length").getPtr<int>();
beam_hyps.length_penalty = length_penalty;
}
invokeTopkBeamSearch<float>(topk_softmax_workspace_,
topk_softmax_workspace_size_,
float_log_prob_buf_,
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
&beam_hyps,
output_tensors->at("finished").getPtr<bool>(),
output_tensors->isExist("sequence_length") ?
output_tensors->at("sequence_length").getPtr<int>() :
(int*)nullptr,
local_batch_size,
beam_width,
vocab_size_padded_,
diversity_rate,
length_penalty,
input_tensors->at("end_id").getPtr<const int>(),
stream_);
sync_check_cuda_error();
invokeUpdateStates(float_log_prob_buf_,
output_tensors->at("cum_log_probs").getPtr<float>(),
output_tensors->getPtrWithOffset<float>("output_log_probs", id_offset, nullptr),
output_tensors->at("finished").getPtr<bool>(),
output_tensors->at("parent_ids").getPtrWithOffset<int>(id_offset),
output_tensors->at("sequence_length").getPtr<int>(),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
&beam_hyps,
local_batch_size,
beam_width,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<const int>(),
stream_);
sync_check_cuda_error();
}
template<typename T>
void BeamSearchLayer<T>::allocateBuffer()
{
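    // The parameterless overload is not supported; buffers must be sized via
    // allocateBuffer(batch_size, beam_width).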
FT_CHECK(false);
}
template<typename T>
void BeamSearchLayer<T>::allocateBuffer(size_t batch_size, size_t beam_width)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
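    // Passing a null workspace pointer makes invokeTopkBeamSearch only report the required
    // workspace size in topk_softmax_workspace_size_ without launching any real work.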
invokeTopkBeamSearch<float>(nullptr,
topk_softmax_workspace_size_,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
batch_size,
beam_width,
vocab_size_padded_,
0.0f, // diversity rate
0.0f, // length penalty
nullptr,
stream_);
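    // A single allocation backs both the top-k workspace and the float log-prob buffer;
    // float_log_prob_buf_ is carved out of the tail of the block below.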
topk_softmax_workspace_ = reinterpret_cast<float*>(allocator_->reMalloc(
topk_softmax_workspace_,
topk_softmax_workspace_size_ + sizeof(float) * batch_size * beam_width * vocab_size_padded_,
false));
float_log_prob_buf_ = (float*)((char*)topk_softmax_workspace_ + topk_softmax_workspace_size_);
is_allocate_buffer_ = true;
}
template<typename T>
BeamSearchLayer<T>::BeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
BaseBeamSearchLayer<T>(max_batch_size,
head_num,
size_per_head,
beam_width,
vocab_size,
vocab_size_padded,
end_id,
diversity_rate,
temperature,
len_penalty,
repetition_penalty,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward)
{
}
template<typename T>
BeamSearchLayer<T>::BeamSearchLayer(BeamSearchLayer<T> const& beam_search_layer):
BaseBeamSearchLayer<T>(beam_search_layer)
{
}
template<typename T>
BeamSearchLayer<T>::~BeamSearchLayer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template class BeamSearchLayer<float>;
template class BeamSearchLayer<half>;
} // namespace turbomind
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include <float.h>
namespace turbomind {
template<typename T>
class BeamSearchLayer: public BaseBeamSearchLayer<T> {
private:
// meta data
using BaseBeamSearchLayer<T>::vocab_size_;
using BaseBeamSearchLayer<T>::vocab_size_padded_;
using BaseBeamSearchLayer<T>::topk_softmax_workspace_size_;
using BaseBeamSearchLayer<T>::topk_softmax_workspace_;
void allocateBuffer() override;
void allocateBuffer(size_t batch_size, size_t beam_width) override;
void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) override;
using BaseBeamSearchLayer<T>::stream_;
using BaseBeamSearchLayer<T>::is_allocate_buffer_;
using BaseBeamSearchLayer<T>::allocator_;
float* float_log_prob_buf_ = nullptr;
protected:
public:
BeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
BeamSearchLayer(BeamSearchLayer<T> const& beam_search_layer);
~BeamSearchLayer();
};
} // namespace turbomind
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
add_library(BaseBeamSearchLayer STATIC BaseBeamSearchLayer.cu)
set_property(TARGET BaseBeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BaseBeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BaseBeamSearchLayer PUBLIC -lcudart beam_search_penalty_kernels cuda_utils)
add_library(OnlineBeamSearchLayer STATIC OnlineBeamSearchLayer.cu)
set_property(TARGET OnlineBeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET OnlineBeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(OnlineBeamSearchLayer PUBLIC -lcudart BaseBeamSearchLayer online_softmax_beamsearch_kernels)
add_library(BeamSearchLayer STATIC BeamSearchLayer.cu)
set_property(TARGET BeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BeamSearchLayer PUBLIC -lcudart BaseBeamSearchLayer beam_search_topk_kernels)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.h"
namespace turbomind {
static const int SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS = 128;
static const int MAX_K = 4;
template<typename T>
__global__ void update_kernel(bool* finished,
int* parent_ids,
int* sequence_length,
int* word_ids,
int* output_ids,
BeamHypotheses beam_hyps,
const int vocab_size,
const int* end_ids,
const int local_batch_size,
const int beam_width)
{
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < local_batch_size * beam_width;
index += blockDim.x * gridDim.x) {
int batch_id = index / beam_width;
sequence_length[index] = finished[index] ? sequence_length[index] : sequence_length[index] + 1;
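        // word_ids holds flattened candidate indices into the [beam_width, vocab_size] score
        // matrix; recover the source beam and the token id, then inherit the source beam's
        // sequence length.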
int beam_id = (word_ids[index] / vocab_size) % beam_width;
int word_id = word_ids[index] % vocab_size;
sequence_length[index] = sequence_length[batch_id * beam_width + beam_id];
finished[index] = word_id == end_ids[index / beam_width] ? 1 : 0;
parent_ids[index] = beam_id;
word_ids[index] = word_id;
output_ids[index] = word_id;
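        // Once beam_width finished hypotheses have been collected for this batch entry,
        // mark all of its beams finished so decoding stops for the entry.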
if (beam_hyps.num_beams != nullptr) {
if (beam_hyps.num_beams[beam_hyps.ite * beam_hyps.local_batch_size + batch_id] == beam_width) {
for (int i = 0; i < beam_width; i++) {
finished[batch_id * beam_width + i] = true;
}
}
}
}
}
void invokeUpdate(bool* finished,
int* parent_ids,
int* sequence_length,
int* word_ids,
int* output_ids,
BeamHypotheses* beam_hyps,
const int local_batch_size,
const int beam_width,
const int vocab_size_padded,
const int* end_ids,
cudaStream_t stream)
{
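    // One thread handles one (batch, beam) slot; 256 threads per block.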
dim3 grid((int)ceil(local_batch_size * beam_width * 1.0 / 256));
dim3 block(256);
update_kernel<float><<<grid, block, 0, stream>>>(finished,
parent_ids,
sequence_length,
word_ids,
output_ids,
*beam_hyps,
vocab_size_padded,
end_ids,
local_batch_size,
beam_width);
}
template<typename T>
void OnlineBeamSearchLayer<T>::invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors)
{
// input_tensors:
// logits [local_batch_size, beam_width, vocab_size_padded]
// embedding_bias [vocab_size_padded]
// step [1] on cpu
// src_cache_indirection [local_batch_size, beam_width, max_seq_len]
// max_input_length [1] on cpu
// input_lengths [local_batch_size * beam_width]
// ite [1] on cpu
// beam_search_diversity_rate [1] on cpu, optional
// temperature [1] on cpu, optional
// len_penalty [1] on cpu, optional
// repetition_penalty [1] on cpu, optional
// output_tensors:
// output_ids [max_seq_len, batch_size, beam_width]
// finished [local_batch_size * beam_width]
// cum_log_probs [local_batch_size * beam_width]
// parent_ids [max_seq_len, batch_size * beam_width]
// sequence_length [local_batch_size * beam_width]
// tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      output_log_probs [max_seq_len, batch_size, beam_width], optional
    //      beam_hyps, optional
FT_CHECK(input_tensors->size() >= 7);
FT_CHECK(output_tensors->size() >= 6);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int beam_width = output_tensors->at("output_ids").shape[2];
const int step = input_tensors->at("step").getVal<int>();
const int ite = input_tensors->at("ite").getVal<int>();
const int local_batch_size = input_tensors->at("logits").shape[0];
const float diversity_rate = input_tensors->isExist("beam_search_diversity_rate") ?
input_tensors->at("beam_search_diversity_rate").getVal<float>() :
0.0f;
const float length_penalty =
input_tensors->isExist("len_penalty") ? input_tensors->at("len_penalty").getVal<float>() : 0.0f;
const int id_offset = step * batch_size * beam_width + local_batch_size * ite * beam_width;
BeamHypotheses beam_hyps;
if (output_tensors->isExist("beam_hyps")) {
beam_hyps = *((BeamHypotheses*)(output_tensors->at("beam_hyps").getPtr<void>()));
beam_hyps.step = step;
beam_hyps.ite = ite;
beam_hyps.local_batch_size = local_batch_size;
beam_hyps.batch_size = output_tensors->at("output_ids").shape[1];
beam_hyps.max_seq_len = output_tensors->at("output_ids").shape[0];
beam_hyps.output_ids_src = output_tensors->at("output_ids").getPtr<int>();
beam_hyps.parent_ids_src = output_tensors->at("parent_ids").getPtr<int>();
beam_hyps.sequence_lengths_src = output_tensors->at("sequence_length").getPtr<int>();
beam_hyps.log_probs_src = output_tensors->getPtr<float>("output_log_probs", nullptr);
beam_hyps.length_penalty = length_penalty;
beam_hyps.end_ids = input_tensors->at("end_id").getPtr<int>();
}
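    // invokeTopkSoftMax fuses the softmax over vocab_size_padded with the per-beam top-k
    // selection and cumulative log-prob update in a single pass (hence the "online" variant);
    // the null second argument skips the optional bias term.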
invokeTopkSoftMax(input_tensors->at("logits").getPtr<T>(),
(const T*)(nullptr),
output_tensors->at("finished").getPtr<bool>(),
output_tensors->at("sequence_length").getPtr<int>(),
output_tensors->at("cum_log_probs").getPtr<float>(),
output_tensors->getPtrWithOffset<float>("output_log_probs", id_offset, nullptr),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
topk_softmax_workspace_,
topk_softmax_workspace_size_,
&beam_hyps,
local_batch_size,
beam_width,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<int>(),
diversity_rate,
length_penalty,
stream_);
sync_check_cuda_error();
invokeUpdate(output_tensors->at("finished").getPtr<bool>(),
output_tensors->at("parent_ids").getPtrWithOffset<int>(id_offset),
output_tensors->at("sequence_length").getPtr<int>(),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
&beam_hyps,
local_batch_size,
beam_width,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<const int>(),
stream_);
sync_check_cuda_error();
}
template<typename T>
void OnlineBeamSearchLayer<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void OnlineBeamSearchLayer<T>::allocateBuffer(size_t batch_size, size_t beam_width)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    // We need to examine 2 * beam_width candidates at each step.
    // 64 is the maximum beam width currently supported.
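    // Roughly: the first term below covers the per-beam candidate buffers (doubled, e.g. for
    // ids and scores) and the second covers the partial top-k / softmax results produced per
    // vocabulary partition; both are rounded up to multiples of 4 floats.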
topk_softmax_workspace_size_ =
(size_t)(ceil(batch_size * 64 * (64 * 2) / 4.) * 4 * 2
+ ceil(batch_size * (64 * 2) * SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS * (2 * (MAX_K * 2) + 2) / 4.) * 4);
topk_softmax_workspace_ = reinterpret_cast<float*>(
allocator_->reMalloc(topk_softmax_workspace_, sizeof(float) * topk_softmax_workspace_size_, true));
is_allocate_buffer_ = true;
}
template<typename T>
OnlineBeamSearchLayer<T>::OnlineBeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
BaseBeamSearchLayer<T>(max_batch_size,
head_num,
size_per_head,
beam_width,
vocab_size,
vocab_size_padded,
end_id,
diversity_rate,
temperature,
len_penalty,
repetition_penalty,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward)
{
}
template<typename T>
OnlineBeamSearchLayer<T>::OnlineBeamSearchLayer(OnlineBeamSearchLayer<T> const& beam_search_layer):
BaseBeamSearchLayer<T>(beam_search_layer)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template<typename T>
OnlineBeamSearchLayer<T>::~OnlineBeamSearchLayer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template class OnlineBeamSearchLayer<float>;
template class OnlineBeamSearchLayer<half>;
} // namespace turbomind
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/kernels/online_softmax_beamsearch_kernels.h"
#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
namespace turbomind {
template<typename T>
class OnlineBeamSearchLayer: public BaseBeamSearchLayer<T> {
private:
// meta data
using BaseBeamSearchLayer<T>::vocab_size_;
using BaseBeamSearchLayer<T>::vocab_size_padded_;
using BaseBeamSearchLayer<T>::topk_softmax_workspace_size_;
using BaseBeamSearchLayer<T>::topk_softmax_workspace_;
void allocateBuffer() override;
void allocateBuffer(size_t batch_size, size_t beam_width) override;
void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) override;
using BaseBeamSearchLayer<T>::stream_;
using BaseBeamSearchLayer<T>::is_allocate_buffer_;
using BaseBeamSearchLayer<T>::allocator_;
protected:
public:
OnlineBeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
OnlineBeamSearchLayer(OnlineBeamSearchLayer<T> const& beam_search_layer);
~OnlineBeamSearchLayer();
};
} // namespace turbomind
@@ -23,7 +23,6 @@ set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 target_link_libraries(Llama PUBLIC -lcudart
 cublasMMWrapper
 DynamicDecodeLayer
-BaseBeamSearchLayer
 activation_kernels
 decoder_masked_multihead_attention
 bert_preprocess_kernels
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/turbomind/models/llama/prefix_cache.h"
// <L,H,D/X,s,X> -> <L,H,D/X,S[:s],X>
template<typename T>
__global__ void insertKeyCache(T* key_cache, const T* src, int L, int H, int Dx, int s, int X, size_t S)
{
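    // i enumerates source elements of shape <L,H,D/X,s,X>; decompose it into (i1, t, x) and
    // re-stride the time axis from s (source length) to S (destination capacity).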
for (int i = threadIdx.x; i < L * H * Dx * s * X; i += blockDim.x) {
int i0 = i / X;
int x = i % X;
int i1 = i0 / s;
int t = i0 % s;
size_t j = (i1 * S + t) * X + x;
key_cache[j] = src[i];
}
}
template<typename T>
void invokeInsertKeyCache(T* key_cache, const T* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st)
{
insertKeyCache<<<1, 512, 0, st>>>(key_cache, src, L, H, Dx, s, X, S);
}
template void
invokeInsertKeyCache(float* key_cache, const float* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st);
template void
invokeInsertKeyCache(half* key_cache, const half* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st);
// <L,H,s,D> -> <L,H,S[:s],D>
template<typename T>
__global__ void insertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, size_t S)
{
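    // Same re-striding as insertKeyCache, but with the head dimension D as the innermost axis.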
for (int i = threadIdx.x; i < L * H * s * D; i += blockDim.x) {
int i0 = i / D;
int d = i % D;
int i1 = i0 / s;
int t = i0 % s;
size_t j = (i1 * S + t) * D + d;
value_cache[j] = src[i];
}
}
template<typename T>
void invokeInsertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, int S, cudaStream_t st)
{
insertValueCache<<<1, 512, 0, st>>>(value_cache, src, L, H, s, D, S);
}
template void
invokeInsertValueCache(float* value_cache, const float* src, int L, int H, int s, int D, int S, cudaStream_t st);
template void
invokeInsertValueCache(half* value_cache, const half* src, int L, int H, int s, int D, int S, cudaStream_t st);