Unverified Commit 9efcac38 authored by Li Zhang, committed by GitHub

check-in fastertransformer (#7)

* add ft code

* gitignore

* fix lint

* revert fmha
parent 720fc533
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/ScaleList.h"
namespace fastertransformer {
template<typename T>
struct AttentionINT8Weight: AttentionWeight<T> {
ScaleList* scale_list_ptr;
};
} // namespace fastertransformer
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/kernels/beam_search_penalty_kernels.h"
#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include "src/fastertransformer/utils/cuda_utils.h"
namespace fastertransformer {
__global__ void update_indir_cache_kernel(int* tgt_indir_cache,
const int* src_indir_cache,
const int* beam_ids,
const bool* finished,
int start_step,
int batch_dim,
int local_batch_size,
int beam_width,
int max_seq_len,
int step)
{
int time_step = threadIdx.x + blockIdx.x * blockDim.x;
int bb_id = threadIdx.y + blockIdx.y * blockDim.y;
const int batch_id = bb_id / beam_width;
const int beam_id = bb_id % beam_width;
if (bb_id >= beam_width * local_batch_size || time_step >= min(step + 1, max_seq_len) || finished[bb_id]) {
return;
}
time_step += start_step;
const int time_step_circ = time_step % max_seq_len;
const int src_beam = beam_ids[batch_id * beam_width + beam_id];
const uint tgt_offset = batch_id * beam_width * max_seq_len + beam_id * max_seq_len + time_step_circ;
const uint src_offset = batch_id * beam_width * max_seq_len + src_beam * max_seq_len + time_step_circ;
tgt_indir_cache[tgt_offset] = (time_step == step) ? beam_id : src_indir_cache[src_offset];
}
void update_indir_cache_kernelLauncher(int* tgt_indir_cache,
const int* src_indir_cache,
const int* beam_ids,
const bool* finished,
int batch_dim,
int local_batch_size,
int beam_width,
int max_seq_len,
int step,
cudaStream_t stream)
{
const dim3 block(32);
const int start_step = max(0, step + 1 - max_seq_len);
const int num_steps = min(step + 1, max_seq_len);
// Update the indirection entries for steps in [start_step, step], inclusive
const dim3 grid((num_steps + block.x - 1) / block.x, local_batch_size * beam_width);
update_indir_cache_kernel<<<grid, block, 0, stream>>>(tgt_indir_cache,
src_indir_cache,
beam_ids,
finished,
start_step,
batch_dim,
local_batch_size,
beam_width,
max_seq_len,
step);
}
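// Host-side reference of the indirection-cache update above (illustration only;
// update_indir_cache_reference is not a FasterTransformer symbol). For every
// unfinished (batch, beam) pair it rewrites the steps [start_step, step] of the
// target cache: past steps gather the entries of the selected source beam, while
// the current step records the beam id itself. The modulo by max_seq_len mirrors
// the circular K/V-cache slot layout used by the kernel.
static void update_indir_cache_reference(int*        tgt_indir_cache,
                                         const int*  src_indir_cache,
                                         const int*  beam_ids,
                                         const bool* finished,
                                         int         local_batch_size,
                                         int         beam_width,
                                         int         max_seq_len,
                                         int         step)
{
    const int start_step = (step + 1 > max_seq_len) ? (step + 1 - max_seq_len) : 0;
    const int num_steps  = (step + 1 < max_seq_len) ? (step + 1) : max_seq_len;
    for (int bb_id = 0; bb_id < local_batch_size * beam_width; bb_id++) {
        if (finished[bb_id]) {
            continue;
        }
        const int batch_id = bb_id / beam_width;
        const int beam_id  = bb_id % beam_width;
        const int src_beam = beam_ids[batch_id * beam_width + beam_id];
        for (int t = 0; t < num_steps; t++) {
            const int time_step = start_step + t;
            const int slot      = time_step % max_seq_len;
            const int tgt       = (batch_id * beam_width + beam_id) * max_seq_len + slot;
            const int src       = (batch_id * beam_width + src_beam) * max_seq_len + slot;
            tgt_indir_cache[tgt] = (time_step == step) ? beam_id : src_indir_cache[src];
        }
    }
}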
template<typename T>
BaseBeamSearchLayer<T>::BaseBeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
DynamicDecodeBaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr),
vocab_size_(vocab_size),
vocab_size_padded_(vocab_size_padded)
{
}
template<typename T>
BaseBeamSearchLayer<T>::BaseBeamSearchLayer(BaseBeamSearchLayer<T> const& beam_search_layer):
DynamicDecodeBaseLayer(beam_search_layer),
vocab_size_(beam_search_layer.vocab_size_),
vocab_size_padded_(beam_search_layer.vocab_size_padded_),
topk_softmax_workspace_size_(beam_search_layer.topk_softmax_workspace_size_)
{
}
template<typename T>
BaseBeamSearchLayer<T>::~BaseBeamSearchLayer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
freeBuffer();
}
template<typename T>
void BaseBeamSearchLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
allocator_->free((void**)(&topk_softmax_workspace_));
is_allocate_buffer_ = false;
}
}
template<typename T>
void BaseBeamSearchLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
{
// do nothing.
}
template<typename T>
void BaseBeamSearchLayer<T>::forward(std::vector<Tensor>* output_tensors, const std::vector<Tensor>* input_tensors)
{
// input_tensors:
// logits [local_batch_size, beam_width, vocab_size_padded]
// embedding_bias [vocab_size_padded]
// step [1] on cpu
// src_cache_indirection [local_batch_size, beam_width, max_seq_len]
// max_input_length [1] on cpu
// input_lengths [local_batch_size * beam_width]
// ite [1] on cpu
// output_tensors:
// output_ids [max_seq_len, batch_size, beam_width]
// finished [local_batch_size * beam_width]
// cum_log_probs [local_batch_size * beam_width]
// parent_ids [max_seq_len, batch_size * beam_width]
// sequence_length [local_batch_size * beam_width]
// tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
std::unordered_map<std::string, Tensor> input_tensors_map{{"logits", input_tensors->at(0)},
{"embedding_bias", input_tensors->at(1)},
{"step", input_tensors->at(2)},
{"src_cache_indirection", input_tensors->at(4)},
{"max_input_length", input_tensors->at(5)},
{"input_lengths", input_tensors->at(6)},
{"ite", input_tensors->at(7)}};
std::unordered_map<std::string, Tensor> output_tensors_map{{"output_ids", output_tensors->at(0)},
{"finished", output_tensors->at(1)},
{"cum_log_probs", output_tensors->at(2)},
{"parent_ids", output_tensors->at(3)},
{"sequence_length", output_tensors->at(4)},
{"tgt_cache_indirection", output_tensors->at(5)}};
forward(&output_tensors_map, &input_tensors_map);
}
template<typename T>
void BaseBeamSearchLayer<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors)
{
TensorMap input_map(*input_tensors);
TensorMap output_map(*output_tensors);
forward(&output_map, &input_map);
}
template<typename T>
void BaseBeamSearchLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors)
{
// input_tensors:
// logits [local_batch_size, beam_width, vocab_size_padded]
// embedding_bias [vocab_size_padded]
// step [1] on cpu
// src_cache_indirection [local_batch_size, beam_width, max_seq_len]
// end_id [local_batch_size]
// max_input_length [1] on cpu
// input_lengths [local_batch_size * beam_width], optional
// ite [1] on cpu
// beam_search_diversity_rate [1] on cpu, optional
// temperature [1] on cpu, optional
// len_penalty [1] on cpu, optional
// repetition_penalty [1] on cpu, optional
// presence_penalty [1] on cpu, optional
// Only one of repetition and presence penalties is allowed.
// min_length [1] on cpu, int, optional
// output_tensors:
// output_ids [max_seq_len, batch_size, beam_width]
// finished [local_batch_size * beam_width], optional
// cum_log_probs [local_batch_size * beam_width]
// parent_ids [max_seq_len, batch_size * beam_width]
// sequence_length [local_batch_size * beam_width], optional
// tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
// output_log_probs [max_seq_len, batch_size, beam_width], optional
// beam_hyps, optional
FT_CHECK(input_tensors->size() >= 7);
FT_CHECK(output_tensors->size() >= 5);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int beam_width = output_tensors->at("output_ids").shape[2];
allocateBuffer(batch_size, beam_width);
const int step = input_tensors->at("step").getVal<int>();
const int ite = input_tensors->at("ite").getVal<int>();
const int local_batch_size = input_tensors->at("logits").shape[0];
const float temperature = input_tensors->getVal<float>("temperature", 1.0f);
const T* embedding_bias = input_tensors->getPtr<const T>("embedding_bias", nullptr);
RepetitionPenaltyType repetition_penalty_type = RepetitionPenaltyType::None;
float repetition_penalty = getDefaultPenaltyValue(repetition_penalty_type);
if (input_tensors->isExist("repetition_penalty") || input_tensors->isExist("presence_penalty")) {
FT_CHECK_WITH_INFO(
!(input_tensors->isExist("repetition_penalty") && input_tensors->isExist("presence_penalty")),
"Found ambiguous parameters repetition_penalty and presence_penalty which are mutually exclusive. "
"Please provide one of repetition_penalty or presence_penalty.");
repetition_penalty_type = input_tensors->isExist("repetition_penalty") ? RepetitionPenaltyType::Multiplicative :
RepetitionPenaltyType::Additive;
repetition_penalty = repetition_penalty_type == RepetitionPenaltyType::Multiplicative ?
input_tensors->getVal<float>("repetition_penalty") :
input_tensors->getVal<float>("presence_penalty");
}
invokeAddBiasApplyPenalties(
step,
input_tensors->at("logits").getPtr<T>(),
output_tensors->at("output_ids")
.getPtrWithOffset<const int>((step - 1) * batch_size * beam_width + ite * local_batch_size * beam_width),
output_tensors->getPtr<const int>("output_ids"),
output_tensors->getPtr<const int>("parent_ids"),
input_tensors->getPtr<const int>("input_lengths", nullptr),
output_tensors->getPtr<const int>("sequence_length", nullptr),
embedding_bias,
ite,
input_tensors->getVal<int>("max_input_length"),
local_batch_size,
batch_size,
beam_width,
vocab_size_,
vocab_size_padded_,
input_tensors->getPtr<const int>("end_id", nullptr),
temperature,
repetition_penalty,
repetition_penalty_type,
input_tensors->getVal<const int>("min_length", 0),
stream_);
sync_check_cuda_error();
invokeSoftMax(output_tensors, input_tensors);
if (beam_width > 1) {
const int max_seq_len = output_tensors->at("output_ids").shape[0];
update_indir_cache_kernelLauncher(
output_tensors->at("tgt_cache_indirection").getPtr<int>(),
input_tensors->at("src_cache_indirection").getPtr<const int>(),
output_tensors->at("parent_ids")
.getPtrWithOffset<const int>(+step * beam_width * batch_size + ite * local_batch_size * beam_width),
output_tensors->at("finished").getPtr<const bool>(),
batch_size,
local_batch_size,
beam_width,
max_seq_len,
step,
stream_);
sync_check_cuda_error();
}
sync_check_cuda_error();
if (is_free_buffer_after_forward_) {
freeBuffer();
}
sync_check_cuda_error();
}
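// Illustrative caller-side sketch (not part of the original source): how the named
// tensors consumed by the TensorMap overload of forward() above could be assembled.
// Every d_* pointer, scalar and extent is a placeholder the caller is assumed to own;
// only keys documented in the comment block of forward() are used.
template<typename T>
static void forwardOneStepExample(BaseBeamSearchLayer<T>* layer,
                                  T*     d_logits,
                                  int*   d_end_ids,
                                  int*   d_input_lengths,
                                  int*   d_src_cache_indirection,
                                  int*   d_tgt_cache_indirection,
                                  int*   d_output_ids,
                                  bool*  d_finished,
                                  float* d_cum_log_probs,
                                  int*   d_parent_ids,
                                  int*   d_sequence_length,
                                  size_t local_batch_size,
                                  size_t batch_size,
                                  size_t beam_width,
                                  size_t vocab_size_padded,
                                  size_t max_seq_len,
                                  int    step,
                                  int    ite,
                                  int    max_input_length)
{
    const DataType data_type = getTensorType<T>();
    TensorMap input_map(std::unordered_map<std::string, Tensor>{
        {"logits", Tensor(MEMORY_GPU, data_type, {local_batch_size, beam_width, vocab_size_padded}, d_logits)},
        {"step", Tensor(MEMORY_CPU, TYPE_INT32, {1}, &step)},
        {"ite", Tensor(MEMORY_CPU, TYPE_INT32, {1}, &ite)},
        {"max_input_length", Tensor(MEMORY_CPU, TYPE_INT32, {1}, &max_input_length)},
        {"end_id", Tensor(MEMORY_GPU, TYPE_INT32, {local_batch_size}, d_end_ids)},
        {"input_lengths", Tensor(MEMORY_GPU, TYPE_INT32, {local_batch_size * beam_width}, d_input_lengths)},
        {"src_cache_indirection",
         Tensor(MEMORY_GPU, TYPE_INT32, {local_batch_size, beam_width, max_seq_len}, d_src_cache_indirection)}});
    TensorMap output_map(std::unordered_map<std::string, Tensor>{
        {"output_ids", Tensor(MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size, beam_width}, d_output_ids)},
        {"finished", Tensor(MEMORY_GPU, TYPE_BOOL, {local_batch_size * beam_width}, d_finished)},
        {"cum_log_probs", Tensor(MEMORY_GPU, TYPE_FP32, {local_batch_size * beam_width}, d_cum_log_probs)},
        {"parent_ids", Tensor(MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size * beam_width}, d_parent_ids)},
        {"sequence_length", Tensor(MEMORY_GPU, TYPE_INT32, {local_batch_size * beam_width}, d_sequence_length)},
        {"tgt_cache_indirection",
         Tensor(MEMORY_GPU, TYPE_INT32, {local_batch_size, beam_width, max_seq_len}, d_tgt_cache_indirection)}});
    layer->forward(&output_map, &input_map);
}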
template class BaseBeamSearchLayer<float>;
template class BaseBeamSearchLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/kernels/penalty_types.h"
#include "src/fastertransformer/layers/DynamicDecodeBaseLayer.h"
namespace fastertransformer {
template<typename T>
class BaseBeamSearchLayer: public DynamicDecodeBaseLayer {
private:
void freeBuffer();
protected:
// meta data
size_t vocab_size_;
size_t vocab_size_padded_;
size_t topk_softmax_workspace_size_;
void* topk_softmax_workspace_ = nullptr;
virtual void allocateBuffer() = 0;
virtual void allocateBuffer(size_t batch_size, size_t beam_width) = 0;
virtual void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) = 0;
public:
BaseBeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
BaseBeamSearchLayer(BaseBeamSearchLayer<T> const& beam_search_layer);
~BaseBeamSearchLayer();
void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
void forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors) override;
void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors) override;
void forward(TensorMap* output_tensors, TensorMap* input_tensors) override;
};
void update_indir_cache_kernelLauncher(int* tgt_indir_cache,
const int* src_indir_cache,
const int* beam_ids,
const bool* finished,
int batch_dim,
int local_batch_size,
int beam_width,
int max_seq_len,
int step,
cudaStream_t stream);
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/layers/beam_search_layers/BeamSearchLayer.h"
namespace fastertransformer {
template<typename T>
__global__ void logProbAddCumLogProb(float* log_probs,
const T* logits,
const float* cum_log_probs,
const int* end_ids,
const bool* finished,
const int beam_width,
const int n)
{
int bid = blockIdx.x;
bool finish = finished != nullptr ? finished[bid] : false;
int offset = bid * n;
float max_val = -1 * FLT_MAX;
__shared__ float s_max_val;
__shared__ float s_sum_val;
if (finish) {
for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
log_probs[offset + tid] = (tid == end_ids[bid / beam_width]) ? cum_log_probs[bid] : -FLT_MAX;
}
}
else {
for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
log_probs[offset + tid] = (float)(logits[offset + tid]);
max_val = max(max_val, log_probs[offset + tid]);
}
max_val = blockReduceMax(max_val);
if (threadIdx.x == 0) {
s_max_val = max_val;
}
__syncthreads();
float sum_val = 0.0f;
for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
log_probs[offset + tid] = __expf(log_probs[offset + tid] - s_max_val);
sum_val += log_probs[offset + tid];
}
sum_val = blockReduceSum(sum_val);
if (threadIdx.x == 0) {
s_sum_val = sum_val + 1e-6f;
}
__syncthreads();
for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
log_probs[offset + tid] = logf(log_probs[offset + tid] / s_sum_val) + cum_log_probs[bid];
}
}
}
template<typename T>
void invokeLogProbAddCumLogProb(float* log_probs,
const T* logits,
const float* cum_log_probs,
const int* end_ids,
const bool* finished,
const int m,
const int beam_width,
const int n,
cudaStream_t stream)
{
dim3 grid(m);
dim3 block(min(n, 1024));
/* n is the vocab_size, e.g., 30000, 7000, ...; vocab_size is usually very large, so each block uses up to 1024 threads. */
logProbAddCumLogProb<<<grid, block, 0, stream>>>(
log_probs, logits, cum_log_probs, end_ids, finished, beam_width, n);
}
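// Host-side reference of logProbAddCumLogProb (illustration only; this helper is not
// part of the original source). For a finished beam, only the end token keeps the
// beam's cumulative score; otherwise the row is turned into log-softmax values and the
// beam's cumulative log probability is added, matching the kernel above.
static void logProbAddCumLogProbReference(float*       log_probs,
                                          const float* logits,
                                          const float* cum_log_probs,
                                          const int*   end_ids,
                                          const bool*  finished,
                                          int          m,          // local_batch_size * beam_width
                                          int          beam_width,
                                          int          n)          // vocab_size (padded)
{
    for (int bid = 0; bid < m; bid++) {
        const int offset = bid * n;
        if (finished != nullptr && finished[bid]) {
            for (int i = 0; i < n; i++) {
                log_probs[offset + i] = (i == end_ids[bid / beam_width]) ? cum_log_probs[bid] : -FLT_MAX;
            }
            continue;
        }
        float max_val = -FLT_MAX;
        for (int i = 0; i < n; i++) {
            max_val = logits[offset + i] > max_val ? logits[offset + i] : max_val;
        }
        float sum_val = 1e-6f;  // same epsilon as the kernel
        for (int i = 0; i < n; i++) {
            log_probs[offset + i] = expf(logits[offset + i] - max_val);
            sum_val += log_probs[offset + i];
        }
        for (int i = 0; i < n; i++) {
            log_probs[offset + i] = logf(log_probs[offset + i] / sum_val) + cum_log_probs[bid];
        }
    }
}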
template<typename T>
__global__ void updateStatesKernel(T* log_probs,
T* cum_log_probs,
float* output_log_probs,
bool* finished,
int* parent_ids,
int* sequence_length,
int* word_ids,
int* output_ids,
BeamHypotheses beam_hyps,
const int local_batch_size,
const int beam_width,
const int vocab_size,
const int* end_ids)
{
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < local_batch_size * beam_width;
index += blockDim.x * gridDim.x) {
int batch_id = index / beam_width;
sequence_length[index] = finished[index] ? sequence_length[index] : sequence_length[index] + 1;
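// word_ids holds indices of a top-k taken over the flattened [beam_width, vocab_size]
// candidate scores of each batch, so each entry encodes both the source beam and the
// token: beam_id = (id / vocab_size) % beam_width, word_id = id % vocab_size.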
int beam_id = (word_ids[index] / vocab_size) % beam_width;
int word_id = word_ids[index] % vocab_size;
if (output_log_probs != nullptr) {
// get the cum_log_probs of previous run
output_log_probs[index] = log_probs[batch_id * beam_width * vocab_size + beam_id * vocab_size + word_id]
- cum_log_probs[batch_id * beam_width + beam_id];
}
cum_log_probs[index] = log_probs[batch_id * beam_width * vocab_size + beam_id * vocab_size + word_id];
sequence_length[index] = sequence_length[batch_id * beam_width + beam_id];
finished[index] = word_id == end_ids[batch_id] ? 1 : 0;
parent_ids[index] = beam_id;
word_ids[index] = word_id;
output_ids[index] = word_id;
if (beam_hyps.num_beams != nullptr) {
if (beam_hyps.num_beams[beam_hyps.ite * beam_hyps.local_batch_size + batch_id] == beam_width) {
for (int i = 0; i < beam_width; i++) {
finished[batch_id * beam_width + i] = true;
}
}
}
}
}
void invokeUpdateStates(float* log_probs,
float* cum_log_probs,
float* output_log_probs,
bool* finished,
int* parent_ids,
int* sequence_length,
int* word_ids,
int* output_ids,
BeamHypotheses* beam_hyps,
const int local_batch_size,
const int beam_width,
const int vocab_size,
const int* end_ids,
cudaStream_t stream)
{
dim3 grid((int)ceil(local_batch_size * beam_width * 1.0 / 256));
dim3 block(256);
updateStatesKernel<float><<<grid, block, 0, stream>>>(log_probs,
cum_log_probs,
output_log_probs,
finished,
parent_ids,
sequence_length,
word_ids,
output_ids,
*beam_hyps,
local_batch_size,
beam_width,
vocab_size,
end_ids);
}
template<typename T>
void BeamSearchLayer<T>::invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors)
{
// input_tensors:
// logits [local_batch_size, beam_width, vocab_size_padded]
// embedding_bias [vocab_size_padded]
// step [1] on cpu
// src_cache_indirection [local_batch_size, beam_width, max_seq_len]
// max_input_length [1] on cpu
// input_lengths [local_batch_size * beam_width]
// ite [1] on cpu
// beam_search_diversity_rate [1] on cpu, optional
// temperature [1] on cpu, optional
// len_penalty [1] on cpu, optional
// repetition_penalty [1] on cpu, optional
// output_tensors:
// output_ids [max_seq_len, batch_size, beam_width]
// finished [local_batch_size * beam_width]
// cum_log_probs [local_batch_size * beam_width]
// parent_ids [max_seq_len, batch_size * beam_width]
// sequence_length [local_batch_size * beam_width]
// tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
// output_log_probs [max_seq_len, batch_size * beam_width], optional
// beam_hyps, optional
FT_CHECK(input_tensors->size() >= 7);
FT_CHECK(output_tensors->size() >= 6);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int beam_width = output_tensors->at("output_ids").shape[2];
const int step = input_tensors->at("step").getVal<int>();
const int ite = input_tensors->at("ite").getVal<int>();
const int local_batch_size = input_tensors->at("logits").shape[0];
const float diversity_rate = input_tensors->isExist("beam_search_diversity_rate") ?
input_tensors->at("beam_search_diversity_rate").getVal<float>() :
0.0f;
const float length_penalty =
input_tensors->isExist("len_penalty") ? input_tensors->at("len_penalty").getVal<float>() : 0.0f;
const int id_offset = step * batch_size * beam_width + ite * local_batch_size * beam_width;
invokeLogProbAddCumLogProb(float_log_prob_buf_,
input_tensors->at("logits").getPtr<T>(),
output_tensors->at("cum_log_probs").getPtr<float>(),
input_tensors->at("end_id").getPtr<const int>(),
output_tensors->at("finished").getPtr<bool>(),
local_batch_size * beam_width,
beam_width,
vocab_size_padded_,
stream_);
sync_check_cuda_error();
BeamHypotheses beam_hyps;
if (output_tensors->isExist("beam_hyps") && diversity_rate == 0.0f) {
beam_hyps = *((BeamHypotheses*)(output_tensors->at("beam_hyps").getPtr<void>()));
beam_hyps.step = step;
beam_hyps.ite = ite;
beam_hyps.local_batch_size = local_batch_size;
beam_hyps.batch_size = output_tensors->at("output_ids").shape[1];
beam_hyps.max_seq_len = output_tensors->at("output_ids").shape[0];
beam_hyps.output_ids_src = output_tensors->at("output_ids").getPtr<int>();
beam_hyps.parent_ids_src = output_tensors->at("parent_ids").getPtr<int>();
beam_hyps.sequence_lengths_src = output_tensors->at("sequence_length").getPtr<int>();
beam_hyps.length_penalty = length_penalty;
}
invokeTopkBeamSearch<float>(topk_softmax_workspace_,
topk_softmax_workspace_size_,
float_log_prob_buf_,
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
&beam_hyps,
output_tensors->at("finished").getPtr<bool>(),
output_tensors->isExist("sequence_length") ?
output_tensors->at("sequence_length").getPtr<int>() :
(int*)nullptr,
local_batch_size,
beam_width,
vocab_size_padded_,
diversity_rate,
length_penalty,
input_tensors->at("end_id").getPtr<const int>(),
stream_);
sync_check_cuda_error();
invokeUpdateStates(float_log_prob_buf_,
output_tensors->at("cum_log_probs").getPtr<float>(),
output_tensors->getPtrWithOffset<float>("output_log_probs", id_offset, nullptr),
output_tensors->at("finished").getPtr<bool>(),
output_tensors->at("parent_ids").getPtrWithOffset<int>(id_offset),
output_tensors->at("sequence_length").getPtr<int>(),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
&beam_hyps,
local_batch_size,
beam_width,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<const int>(),
stream_);
sync_check_cuda_error();
}
template<typename T>
void BeamSearchLayer<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void BeamSearchLayer<T>::allocateBuffer(size_t batch_size, size_t beam_width)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
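// A null workspace pointer makes this first invokeTopkBeamSearch call only compute the
// required workspace size and store it in topk_softmax_workspace_size_; no beam search
// is performed. The buffer allocated below additionally reserves
// batch_size * beam_width * vocab_size_padded_ floats for float_log_prob_buf_.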
invokeTopkBeamSearch<float>(nullptr,
topk_softmax_workspace_size_,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
batch_size,
beam_width,
vocab_size_padded_,
0.0f, // diversity rate
0.0f, // length penalty
nullptr,
stream_);
topk_softmax_workspace_ = reinterpret_cast<float*>(allocator_->reMalloc(
topk_softmax_workspace_,
topk_softmax_workspace_size_ + sizeof(float) * batch_size * beam_width * vocab_size_padded_,
false));
float_log_prob_buf_ = (float*)((char*)topk_softmax_workspace_ + topk_softmax_workspace_size_);
is_allocate_buffer_ = true;
}
template<typename T>
BeamSearchLayer<T>::BeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
BaseBeamSearchLayer<T>(max_batch_size,
head_num,
size_per_head,
beam_width,
vocab_size,
vocab_size_padded,
end_id,
diversity_rate,
temperature,
len_penalty,
repetition_penalty,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward)
{
}
template<typename T>
BeamSearchLayer<T>::BeamSearchLayer(BeamSearchLayer<T> const& beam_search_layer):
BaseBeamSearchLayer<T>(beam_search_layer)
{
}
template<typename T>
BeamSearchLayer<T>::~BeamSearchLayer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template class BeamSearchLayer<float>;
template class BeamSearchLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/kernels/beam_search_topk_kernels.h"
#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include <float.h>
namespace fastertransformer {
template<typename T>
class BeamSearchLayer: public BaseBeamSearchLayer<T> {
private:
// meta data
using BaseBeamSearchLayer<T>::vocab_size_;
using BaseBeamSearchLayer<T>::vocab_size_padded_;
using BaseBeamSearchLayer<T>::topk_softmax_workspace_size_;
using BaseBeamSearchLayer<T>::topk_softmax_workspace_;
void allocateBuffer() override;
void allocateBuffer(size_t batch_size, size_t beam_width) override;
void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) override;
using BaseBeamSearchLayer<T>::stream_;
using BaseBeamSearchLayer<T>::is_allocate_buffer_;
using BaseBeamSearchLayer<T>::allocator_;
float* float_log_prob_buf_ = nullptr;
protected:
public:
BeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
BeamSearchLayer(BeamSearchLayer<T> const& beam_search_layer);
~BeamSearchLayer();
};
} // namespace fastertransformer
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
add_library(BaseBeamSearchLayer STATIC BaseBeamSearchLayer.cu)
set_property(TARGET BaseBeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BaseBeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BaseBeamSearchLayer PUBLIC -lcudart beam_search_penalty_kernels cuda_utils)
add_library(OnlineBeamSearchLayer STATIC OnlineBeamSearchLayer.cu)
set_property(TARGET OnlineBeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET OnlineBeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(OnlineBeamSearchLayer PUBLIC -lcudart BaseBeamSearchLayer online_softmax_beamsearch_kernels)
add_library(BeamSearchLayer STATIC BeamSearchLayer.cu)
set_property(TARGET BeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BeamSearchLayer PUBLIC -lcudart BaseBeamSearchLayer beam_search_topk_kernels)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/layers/beam_search_layers/OnlineBeamSearchLayer.h"
namespace fastertransformer {
static const int SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS = 128;
static const int MAX_K = 4;
template<typename T>
__global__ void update_kernel(bool* finished,
int* parent_ids,
int* sequence_length,
int* word_ids,
int* output_ids,
BeamHypotheses beam_hyps,
const int vocab_size,
const int* end_ids,
const int local_batch_size,
const int beam_width)
{
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < local_batch_size * beam_width;
index += blockDim.x * gridDim.x) {
int batch_id = index / beam_width;
sequence_length[index] = finished[index] ? sequence_length[index] : sequence_length[index] + 1;
int beam_id = (word_ids[index] / vocab_size) % beam_width;
int word_id = word_ids[index] % vocab_size;
sequence_length[index] = sequence_length[batch_id * beam_width + beam_id];
finished[index] = word_id == end_ids[index / beam_width] ? 1 : 0;
parent_ids[index] = beam_id;
word_ids[index] = word_id;
output_ids[index] = word_id;
if (beam_hyps.num_beams != nullptr) {
if (beam_hyps.num_beams[beam_hyps.ite * beam_hyps.local_batch_size + batch_id] == beam_width) {
for (int i = 0; i < beam_width; i++) {
finished[batch_id * beam_width + i] = true;
}
}
}
}
}
void invokeUpdate(bool* finished,
int* parent_ids,
int* sequence_length,
int* word_ids,
int* output_ids,
BeamHypotheses* beam_hyps,
const int local_batch_size,
const int beam_width,
const int vocab_size_padded,
const int* end_ids,
cudaStream_t stream)
{
dim3 grid((int)ceil(local_batch_size * beam_width * 1.0 / 256));
dim3 block(256);
update_kernel<float><<<grid, block, 0, stream>>>(finished,
parent_ids,
sequence_length,
word_ids,
output_ids,
*beam_hyps,
vocab_size_padded,
end_ids,
local_batch_size,
beam_width);
}
template<typename T>
void OnlineBeamSearchLayer<T>::invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors)
{
// input_tensors:
// logits [local_batch_size, beam_width, vocab_size_padded]
// embedding_bias [vocab_size_padded]
// step [1] on cpu
// src_cache_indirection [local_batch_size, beam_width, max_seq_len]
// max_input_length [1] on cpu
// input_lengths [local_batch_size * beam_width]
// ite [1] on cpu
// beam_search_diversity_rate [1] on cpu, optional
// temperature [1] on cpu, optional
// len_penalty [1] on cpu, optional
// repetition_penalty [1] on cpu, optional
// output_tensors:
// output_ids [max_seq_len, batch_size, beam_width]
// finished [local_batch_size * beam_width]
// cum_log_probs [local_batch_size * beam_width]
// parent_ids [max_seq_len, batch_size * beam_width]
// sequence_length [local_batch_size * beam_width]
// tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
// output_log_probs [max_seq_len, batch_size, beam_width]
FT_CHECK(input_tensors->size() >= 7);
FT_CHECK(output_tensors->size() >= 6);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int beam_width = output_tensors->at("output_ids").shape[2];
const int step = input_tensors->at("step").getVal<int>();
const int ite = input_tensors->at("ite").getVal<int>();
const int local_batch_size = input_tensors->at("logits").shape[0];
const float diversity_rate = input_tensors->isExist("beam_search_diversity_rate") ?
input_tensors->at("beam_search_diversity_rate").getVal<float>() :
0.0f;
const float length_penalty =
input_tensors->isExist("len_penalty") ? input_tensors->at("len_penalty").getVal<float>() : 0.0f;
const int id_offset = step * batch_size * beam_width + local_batch_size * ite * beam_width;
BeamHypotheses beam_hyps;
if (output_tensors->isExist("beam_hyps")) {
beam_hyps = *((BeamHypotheses*)(output_tensors->at("beam_hyps").getPtr<void>()));
beam_hyps.step = step;
beam_hyps.ite = ite;
beam_hyps.local_batch_size = local_batch_size;
beam_hyps.batch_size = output_tensors->at("output_ids").shape[1];
beam_hyps.max_seq_len = output_tensors->at("output_ids").shape[0];
beam_hyps.output_ids_src = output_tensors->at("output_ids").getPtr<int>();
beam_hyps.parent_ids_src = output_tensors->at("parent_ids").getPtr<int>();
beam_hyps.sequence_lengths_src = output_tensors->at("sequence_length").getPtr<int>();
beam_hyps.log_probs_src = output_tensors->getPtr<float>("output_log_probs", nullptr);
beam_hyps.length_penalty = length_penalty;
beam_hyps.end_ids = input_tensors->at("end_id").getPtr<int>();
}
invokeTopkSoftMax(input_tensors->at("logits").getPtr<T>(),
(const T*)(nullptr),
output_tensors->at("finished").getPtr<bool>(),
output_tensors->at("sequence_length").getPtr<int>(),
output_tensors->at("cum_log_probs").getPtr<float>(),
output_tensors->getPtrWithOffset<float>("output_log_probs", id_offset, nullptr),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
topk_softmax_workspace_,
topk_softmax_workspace_size_,
&beam_hyps,
local_batch_size,
beam_width,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<int>(),
diversity_rate,
length_penalty,
stream_);
sync_check_cuda_error();
invokeUpdate(output_tensors->at("finished").getPtr<bool>(),
output_tensors->at("parent_ids").getPtrWithOffset<int>(id_offset),
output_tensors->at("sequence_length").getPtr<int>(),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
&beam_hyps,
local_batch_size,
beam_width,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<const int>(),
stream_);
sync_check_cuda_error();
}
template<typename T>
void OnlineBeamSearchLayer<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void OnlineBeamSearchLayer<T>::allocateBuffer(size_t batch_size, size_t beam_width)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
// we need to check 2 * beam_width candidates each time
// 64 is the max beam width we support now.
topk_softmax_workspace_size_ =
(size_t)(ceil(batch_size * 64 * (64 * 2) / 4.) * 4 * 2
+ ceil(batch_size * (64 * 2) * SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS * (2 * (MAX_K * 2) + 2) / 4.) * 4);
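// Rough reading of the formula above (see online_softmax_beamsearch_kernels for the
// authoritative layout): the first term presumably holds the per-batch candidate
// buffers of the two-stage top-K reduction (up to 64 beams with 2 * 64 candidates
// each, rounded up to a multiple of 4 and doubled), and the second the per-vocab-part
// partial results (SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS parts with 2 * (MAX_K * 2) + 2
// values each).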
topk_softmax_workspace_ = reinterpret_cast<float*>(
allocator_->reMalloc(topk_softmax_workspace_, sizeof(float) * topk_softmax_workspace_size_, true));
is_allocate_buffer_ = true;
}
template<typename T>
OnlineBeamSearchLayer<T>::OnlineBeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
BaseBeamSearchLayer<T>(max_batch_size,
head_num,
size_per_head,
beam_width,
vocab_size,
vocab_size_padded,
end_id,
diversity_rate,
temperature,
len_penalty,
repetition_penalty,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward)
{
}
template<typename T>
OnlineBeamSearchLayer<T>::OnlineBeamSearchLayer(OnlineBeamSearchLayer<T> const& beam_search_layer):
BaseBeamSearchLayer<T>(beam_search_layer)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template<typename T>
OnlineBeamSearchLayer<T>::~OnlineBeamSearchLayer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template class OnlineBeamSearchLayer<float>;
template class OnlineBeamSearchLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/kernels/online_softmax_beamsearch_kernels.h"
#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
namespace fastertransformer {
template<typename T>
class OnlineBeamSearchLayer: public BaseBeamSearchLayer<T> {
private:
// meta data
using BaseBeamSearchLayer<T>::vocab_size_;
using BaseBeamSearchLayer<T>::vocab_size_padded_;
using BaseBeamSearchLayer<T>::topk_softmax_workspace_size_;
using BaseBeamSearchLayer<T>::topk_softmax_workspace_;
void allocateBuffer() override;
void allocateBuffer(size_t batch_size, size_t beam_width) override;
void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) override;
using BaseBeamSearchLayer<T>::stream_;
using BaseBeamSearchLayer<T>::is_allocate_buffer_;
using BaseBeamSearchLayer<T>::allocator_;
protected:
public:
OnlineBeamSearchLayer(size_t max_batch_size,
size_t head_num,
size_t size_per_head,
size_t beam_width,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float diversity_rate,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
OnlineBeamSearchLayer(OnlineBeamSearchLayer<T> const& beam_search_layer);
~OnlineBeamSearchLayer();
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.h"
#include "src/fastertransformer/kernels/sampling_penalty_kernels.h"
#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include <algorithm>
namespace fastertransformer {
template<typename T>
void BaseSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
curandstate_buf_ = reinterpret_cast<curandState_t*>(
allocator_->reMalloc(curandstate_buf_, sizeof(curandState_t) * batch_size, false));
random_seeds_buf_ = reinterpret_cast<unsigned long long*>(
allocator_->reMalloc(random_seeds_buf_, sizeof(unsigned long long) * batch_size, false));
temperature_buf_ =
reinterpret_cast<float*>(allocator_->reMalloc(temperature_buf_, sizeof(float) * batch_size, false));
repetition_penalty_buf_ =
reinterpret_cast<float*>(allocator_->reMalloc(repetition_penalty_buf_, sizeof(float) * batch_size, false));
min_lengths_buf_ = reinterpret_cast<int*>(allocator_->reMalloc(min_lengths_buf_, sizeof(int) * batch_size, false));
runtime_logits_buf_ = reinterpret_cast<T*>(
allocator_->reMalloc(runtime_logits_buf_, sizeof(T) * batch_size * vocab_size_padded_, false));
skip_decode_buf_ =
reinterpret_cast<bool*>(allocator_->reMalloc(skip_decode_buf_, sizeof(bool) * batch_size, false));
// host buffers.
temperature_ = new float[batch_size];
repetition_penalty_ = new float[batch_size];
min_lengths_ = new int[batch_size];
skip_decode_ = new bool[batch_size];
is_allocate_buffer_ = true;
}
template<typename T>
void BaseSamplingLayer<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&curandstate_buf_));
allocator_->free((void**)(&random_seeds_buf_));
allocator_->free((void**)(&temperature_buf_));
allocator_->free((void**)(&repetition_penalty_buf_));
allocator_->free((void**)(&min_lengths_buf_));
allocator_->free((void**)(&runtime_logits_buf_));
allocator_->free((void**)(&skip_decode_buf_));
delete[] temperature_;
delete[] repetition_penalty_;
delete[] min_lengths_;
delete[] skip_decode_;
is_allocate_buffer_ = false;
}
}
template<typename T>
BaseSamplingLayer<T>::BaseSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
size_t top_k,
float top_p,
unsigned long long random_seed,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop):
DynamicDecodeBaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop),
vocab_size_(vocab_size),
vocab_size_padded_(vocab_size_padded)
{
}
template<typename T>
BaseSamplingLayer<T>::BaseSamplingLayer(BaseSamplingLayer const& sampling_layer):
DynamicDecodeBaseLayer(sampling_layer),
vocab_size_(sampling_layer.vocab_size_),
vocab_size_padded_(sampling_layer.vocab_size_padded_),
sampling_workspace_size_(sampling_layer.sampling_workspace_size_)
{
}
template<typename T>
BaseSamplingLayer<T>::~BaseSamplingLayer()
{
}
template<typename T>
void BaseSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
{
// Set up the sampling layer for given runtime arguments.
//
// runtime_args:
// runtime_top_k [1] or [batch_size] on cpu, optional.
// runtime_top_p [1] or [batch_size] on cpu, optional
// temperature [1] or [batch_size] on cpu, optional
// repetition_penalty [1] or [batch_size] on cpu, optional
// presence_penalty [1] or [batch_size] on cpu, optional,
// repetition_penalty and presence_penalty are mutually exclusive.
// min_length [1] or [batch_size] on cpu, optional
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
Tensor runtime_top_k = runtime_args->isExist("runtime_top_k") ? runtime_args->at("runtime_top_k") : Tensor();
Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor();
allocateBuffer(batch_size, runtime_top_k, runtime_top_p);
// If the runtime argument provides a single random seed, use it to initialize the curand
// states of all sentences. If it provides [batch_size] random seeds, initialize each
// sentence's state with its own seed. If no random seed is given, initialize all states with seed 0.
if (runtime_args->isExist("random_seed")) {
Tensor random_seeds = runtime_args->at("random_seed");
FT_CHECK_WITH_INFO(random_seeds.shape.size() == 1
&& (random_seeds.size() == 1 || random_seeds.size() == batch_size),
fmtstr("random_seeds must be of shape [1] or [batch_size(%ld)], got random_seeds.shape=%s",
batch_size,
vec2str(random_seeds.shape).c_str()));
if (random_seeds.size() == 1) {
invokeCurandInitialize(curandstate_buf_, batch_size, random_seeds.getVal<unsigned long long>(), stream_);
sync_check_cuda_error();
}
else {
unsigned long long* random_seed_ptr = random_seeds.getPtr<unsigned long long>();
cudaAutoCpy(random_seeds_buf_, random_seed_ptr, batch_size, stream_);
invokeCurandBatchInitialize(curandstate_buf_, batch_size, random_seeds_buf_, stream_);
sync_check_cuda_error();
}
}
else {
// Initialize curand states using the default seed 0.
invokeCurandInitialize(curandstate_buf_, batch_size, 0, stream_);
}
// Setup penalties.
const float default_temperature = 1.0f;
Tensor temperature = runtime_args->isExist("temperature") ?
runtime_args->at("temperature") :
Tensor(MEMORY_CPU, TYPE_FP32, {1}, &default_temperature);
if (temperature.size() == 1) {
float tp = temperature.getVal<float>();
deviceFill(temperature_buf_, batch_size, tp, stream_);
std::fill_n(temperature_, batch_size, tp);
}
else {
cudaAutoCpy(temperature_buf_, temperature.getPtr<float>(), batch_size, stream_);
std::copy_n(temperature.getPtr<float>(), batch_size, temperature_);
}
if (runtime_args->isExist("repetition_penalty") || runtime_args->isExist("presence_penalty")) {
FT_CHECK_WITH_INFO(
!(runtime_args->isExist("repetition_penalty") && runtime_args->isExist("presence_penalty")),
"Found ambiguous parameters repetition_penalty and presence_penalty which are mutually exclusive. "
"Please provide one of repetition_penalty or presence_penalty.");
repetition_penalty_type_ = runtime_args->isExist("repetition_penalty") ? RepetitionPenaltyType::Multiplicative :
RepetitionPenaltyType::Additive;
Tensor repetition_penalty = repetition_penalty_type_ == RepetitionPenaltyType::Multiplicative ?
runtime_args->at("repetition_penalty") :
runtime_args->at("presence_penalty");
if (repetition_penalty.size() == 1) {
float rp = repetition_penalty.getVal<float>();
deviceFill(repetition_penalty_buf_, batch_size, rp, stream_);
std::fill_n(repetition_penalty_, batch_size, rp);
}
else {
cudaAutoCpy(repetition_penalty_buf_, repetition_penalty.getPtr<float>(), batch_size, stream_);
std::copy_n(repetition_penalty.getPtr<float>(), batch_size, repetition_penalty_);
}
}
else {
repetition_penalty_type_ = RepetitionPenaltyType::None;
}
const int default_min_length = 0;
Tensor min_lengths = runtime_args->at("min_length", Tensor(MEMORY_CPU, TYPE_INT32, {1}, &default_min_length));
if (min_lengths.size() == 1) {
int minlen = min_lengths.getVal<int>();
deviceFill(min_lengths_buf_, batch_size, minlen, stream_);
std::fill_n(min_lengths_, batch_size, minlen);
}
else {
cudaAutoCpy(min_lengths_buf_, min_lengths.getPtr<int>(), batch_size, stream_);
std::copy_n(min_lengths.getPtr<int>(), batch_size, min_lengths_);
}
}
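// Illustrative caller-side sketch (not part of the original source): building the
// runtime_args TensorMap documented above. The scalar values are arbitrary examples
// and every key is optional; per-sample control is possible by passing [batch_size]
// shaped CPU tensors instead of [1].
template<typename T>
static void setupSamplingExample(BaseSamplingLayer<T>* layer, size_t batch_size)
{
    unsigned int       runtime_top_k      = 4;
    float              runtime_top_p      = 0.9f;
    float              temperature        = 0.7f;
    float              repetition_penalty = 1.1f;
    unsigned long long random_seed        = 42ULL;
    TensorMap runtime_args(std::unordered_map<std::string, Tensor>{
        {"runtime_top_k", Tensor(MEMORY_CPU, TYPE_UINT32, {1}, &runtime_top_k)},
        {"runtime_top_p", Tensor(MEMORY_CPU, TYPE_FP32, {1}, &runtime_top_p)},
        {"temperature", Tensor(MEMORY_CPU, TYPE_FP32, {1}, &temperature)},
        {"repetition_penalty", Tensor(MEMORY_CPU, TYPE_FP32, {1}, &repetition_penalty)},
        {"random_seed", Tensor(MEMORY_CPU, TYPE_UINT64, {1}, &random_seed)}});
    layer->setup(batch_size, /* beam_width = */ 1, &runtime_args);
}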
template<typename T>
void BaseSamplingLayer<T>::forward(std::vector<Tensor>* output_tensors, const std::vector<Tensor>* input_tensors)
{
// input_tensors:
// logits [local_batch_size, vocab_size_padded]
// embedding_bias [vocab_size_padded]
// step [1] on cpu
// max_input_length [1] on cpu
// input_lengths [local_batch_size]
// ite [1] on cpu
// random_seed [1] on cpu, optional
// output_tensors:
// output_ids [max_seq_len, batch_size]
// finished [local_batch_size]
// sequence_length [local_batch_size]
// cum_log_probs [local_batch_size], must be float*
FT_CHECK(false); // TODO deprecated, need to remove
std::unordered_map<std::string, Tensor> input_tensors_map{{"logits", input_tensors->at(0)},
{"embedding_bias", input_tensors->at(1)},
{"step", input_tensors->at(2)},
{"max_input_length", input_tensors->at(3)},
{"input_lengths", input_tensors->at(4)},
{"ite", input_tensors->at(5)}};
if (input_tensors->size() == 7) {
input_tensors_map.insert({"random_seed", input_tensors->at(6)});
}
std::unordered_map<std::string, Tensor> output_tensors_map{{"output_ids", output_tensors->at(0)},
{"finished", output_tensors->at(1)},
{"sequence_length", output_tensors->at(2)},
{"cum_log_probs", output_tensors->at(3)}};
forward(&output_tensors_map, &input_tensors_map);
}
template<typename T>
void BaseSamplingLayer<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors)
{
FT_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
TensorMap input_map(*input_tensors);
TensorMap output_map(*output_tensors);
forward(&output_map, &input_map);
}
template<typename T>
void BaseSamplingLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors)
{
// input_tensors:
// logits [local_batch_size, vocab_size_padded]
// embedding_bias [vocab_size_padded], optional
// step [1] on cpu
// max_input_length [1] on cpu
// input_lengths [local_batch_size], optional
// ite [1] on cpu
// end_id [local_batch_size], optional
// output_tensors:
// output_ids [max_seq_len, batch_size]
// finished [local_batch_size], optional
// sequence_length [local_batch_size], optional
// cum_log_probs [batch_size], must be float*, optional
// The cumulative log probability of generated tokens.
// output_log_probs [local_batch_size], must be float*, optional
// The log probs at the current step.
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() >= 4);
FT_CHECK(output_tensors->size() >= 1);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int local_batch_size = input_tensors->at("logits").shape[0];
const int step = input_tensors->at("step").getVal<int>();
const int ite = input_tensors->at("ite").getVal<int>();
const int max_input_length = input_tensors->at("max_input_length").getVal<int>();
T* logits = input_tensors->at("logits").getPtr<T>();
#define ALL_OF(p_, sz_, dt_, v_) (std::all_of(p_, p_ + sz_, [&](dt_ b) { return b == v_; }))
bool* skip_decode = skip_decode_ + ite * local_batch_size;
if (ALL_OF(skip_decode, local_batch_size, bool, true)) {
// No sample in the current batch requires TopX sampling.
return;
}
skip_any_ = std::any_of(skip_decode, skip_decode + local_batch_size, [](bool b) { return b; });
if (skip_any_) {
// A TopX sampling layer modifies the logit values in place. When skip_any == true,
// the topk and topp layers both run for the same batch in the same step, so we copy
// the logits to an internal buffer to avoid affecting the other sampling layer.
FT_CHECK(input_tensors->at("logits").size() == local_batch_size * vocab_size_padded_);
cudaD2Dcpy(runtime_logits_buf_, logits, input_tensors->at("logits").size());
logits = runtime_logits_buf_;
}
const T* embedding_bias =
input_tensors->isExist("embedding_bias") ? input_tensors->at("embedding_bias").getPtr<T>() : nullptr;
if (embedding_bias != nullptr || !ALL_OF(temperature_ + ite * local_batch_size, local_batch_size, float, 1.0f)) {
invokeBatchApplyTemperaturePenalty(logits,
embedding_bias,
temperature_buf_ + ite * local_batch_size,
local_batch_size,
vocab_size_,
vocab_size_padded_,
stream_);
}
sync_check_cuda_error();
if (step > 1 && repetition_penalty_type_ != RepetitionPenaltyType::None) {
float default_value = getDefaultPenaltyValue(repetition_penalty_type_);
if (!ALL_OF(repetition_penalty_ + ite * local_batch_size, local_batch_size, float, default_value)) {
invokeBatchApplyRepetitionPenalty(
logits,
repetition_penalty_buf_ + ite * local_batch_size,
output_tensors->at("output_ids").getPtrWithOffset<int>(ite * local_batch_size),
batch_size,
local_batch_size,
vocab_size_padded_,
input_tensors->at("input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {}, nullptr}).getPtr<int>(),
max_input_length,
step,
repetition_penalty_type_,
stream_);
sync_check_cuda_error();
}
}
const int num_generated_tokens = step - max_input_length;
const int* min_lengths = min_lengths_ + ite * local_batch_size;
const bool invoke_min_length_penalty = std::any_of(
min_lengths, min_lengths + local_batch_size, [&](int min_length) { return min_length > num_generated_tokens; });
if (invoke_min_length_penalty) {
FT_CHECK_WITH_INFO(input_tensors->isExist("end_id"), "Need end_id to apply min length penalty");
invokeMinLengthPenalty(logits,
min_lengths_buf_ + ite * local_batch_size,
input_tensors->getPtr<const int>("end_id"),
output_tensors->getPtr<const int>("sequence_length"),
max_input_length,
local_batch_size,
vocab_size_padded_,
stream_);
sync_check_cuda_error();
}
#undef ALL_OF
runSampling(output_tensors, input_tensors);
if (is_free_buffer_after_forward_) {
freeBuffer();
}
sync_check_cuda_error();
FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
}
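// Rough host-side sketch of the min-length penalty applied above. Assumption: the
// kernel masks the end-of-sequence logit until min_length new tokens have been
// generated; the authoritative implementation is invokeMinLengthPenalty in
// sampling_penalty_kernels. Illustration only, not part of the original source.
static void minLengthPenaltyReference(float*     logits,            // [local_batch_size, vocab_size_padded]
                                      const int* min_lengths,       // [local_batch_size]
                                      const int* end_ids,           // [local_batch_size]
                                      const int* sequence_lengths,  // [local_batch_size]
                                      int        max_input_length,
                                      int        local_batch_size,
                                      size_t     vocab_size_padded)
{
    for (int b = 0; b < local_batch_size; b++) {
        const int num_generated = sequence_lengths[b] - max_input_length;
        if (num_generated < min_lengths[b]) {
            // effectively forbid sampling the end token for this sequence
            logits[b * vocab_size_padded + end_ids[b]] = -1e20f;
        }
    }
}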
template class BaseSamplingLayer<float>;
template class BaseSamplingLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <curand_kernel.h>
#include "src/fastertransformer/kernels/penalty_types.h"
#include "src/fastertransformer/layers/DynamicDecodeBaseLayer.h"
namespace fastertransformer {
template<typename T>
class BaseSamplingLayer: public DynamicDecodeBaseLayer {
private:
bool isValidBatchSize(size_t batch_size);
protected:
size_t vocab_size_;
size_t vocab_size_padded_;
size_t sampling_workspace_size_;
void* sampling_workspace_ = nullptr;
curandState_t* curandstate_buf_ = nullptr;
unsigned long long* random_seeds_buf_ = nullptr;
float* temperature_buf_ = nullptr;
float* repetition_penalty_buf_ = nullptr;
int* min_lengths_buf_ = nullptr;
bool* skip_decode_buf_ = nullptr;
T* runtime_logits_buf_ = nullptr;
float* temperature_ = nullptr;
float* repetition_penalty_ = nullptr;
int* min_lengths_ = nullptr;
bool* skip_decode_ = nullptr;
bool skip_any_ = false;
RepetitionPenaltyType repetition_penalty_type_ = RepetitionPenaltyType::None;
virtual void runSampling(TensorMap* output_tensors, TensorMap* input_tensors) = 0;
virtual void freeBuffer();
virtual void allocateBuffer() = 0;
virtual void allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p);
public:
curandState_t* curandstate_buf()
{
return curandstate_buf_;
}
BaseSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
size_t top_k,
float top_p,
unsigned long long random_seed, // TODO(bhsueh) delete
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop);
BaseSamplingLayer(BaseSamplingLayer const& sampling_layer);
~BaseSamplingLayer();
void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
void forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors) override;
void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors) override;
void forward(TensorMap* output_tensors, TensorMap* input_tensors) override;
};
} // namespace fastertransformer
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
add_library(BaseSamplingLayer STATIC BaseSamplingLayer.cc)
set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BaseSamplingLayer PUBLIC -lcudart sampling_penalty_kernels memory_utils)
add_library(TopKSamplingLayer STATIC TopKSamplingLayer.cu)
set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopKSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels)
add_library(TopPSamplingLayer STATIC TopPSamplingLayer.cu)
set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopPSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <float.h>
#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
#include "src/fastertransformer/kernels/sampling_topp_kernels.h"
#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/fastertransformer/utils/memory_utils.h"
namespace fastertransformer {
template<uint TOP_K_MAX>
__global__ void setup_topk_runtime_args(int batch_size,
uint top_k,
uint* top_ks,
int top_ks_size,
float top_p,
float* top_ps,
int top_ps_size,
bool* skip_decode)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = index; i < batch_size; i += gridDim.x * blockDim.x) {
uint k = top_ks_size > 1 ? top_ks[i] : top_k;
float p = top_ps_size > 1 ? top_ps[i] : top_p;
if (k == 0 && p == 0.0f) {
            // FT's top-p implementation does not support top_p = 0.0f, but that setting is
            // equivalent to greedy search, so we fall back to top_k = 1 instead.
k = 1;
}
if (k > 0 && p == 0.0f) {
            // For compatibility with FT <= 5.0.
            // This case corresponds to the old top-k sampling, which is equivalent to the
            // old topk_topp sampling with top_p = 1.0f. TopKSamplingLayer and
            // TopKTopPSamplingLayer have been merged into TopKSamplingLayer, so the case
            // top_k > 0 and top_p = 0.0f is mapped to top_k > 0 and top_p = 1.0f.
p = 1.0f;
}
        // Clip the k value: the top-k sampling kernel supports at most TOP_K_MAX candidates.
top_ks[i] = k > TOP_K_MAX ? TOP_K_MAX : k;
if (k > TOP_K_MAX) {
printf("[WARNING] topk (%d) is larger than max supported number (%d) for token %d"
" clip to max supported number %d. \n",
k,
TOP_K_MAX,
i,
top_ks[i]);
}
// Clip p value if it is out of range. range = [0.0, 1.0].
top_ps[i] = p < 0.0f ? 0.0f : (p > 1.0f ? 1.0f : p);
if (p < 0.0f || p > 1.0f) {
printf("[WARNING] topp (%f) is out of range ([0.0, 1.0f]) for token %d"
" clip to closest number %f.\n",
p,
i,
top_ps[i]);
}
skip_decode[i] = k == 0;
}
}
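// Summary of the normalization performed by setup_topk_runtime_args (a descriptive note, not
// part of the original file): per sequence, the requested (top_k, top_p) pair is mapped to an
// effective pair before decoding.
//
//     requested (k, p)      effective (k, p)         handled by
//     (0, 0)                (1, 1.0)                 TopKSamplingLayer (greedy)
//     (k > 0, 0)            (k, 1.0)                 TopKSamplingLayer
//     (k > 0, p > 0)        (min(k, TOP_K_MAX), p)   TopKSamplingLayer
//     (0, p > 0)            unchanged                TopPSamplingLayer (skip_decode is true here)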
template<typename T>
void TopKSamplingLayer<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void TopKSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
BaseSamplingLayer<T>::allocateBuffer(batch_size, top_k, top_p);
uint max_top_k = top_k.size() > 0 ? top_k.max<uint>() : 1;
if (max_top_k == 0) {
        // For safety: TopKSamplingLayer treats the case top_k = 0 and top_p = 0 as greedy
        // decoding (i.e. top_k = 1), even though max_top_k is 0 in that case.
max_top_k = 1;
}
invokeTopKSampling<T>(nullptr,
sampling_workspace_size_,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
max_top_k,
1.0f,
vocab_size_padded_,
nullptr,
stream_,
batch_size,
skip_decode_buf_);
sampling_workspace_ = allocator_->reMalloc(sampling_workspace_, sampling_workspace_size_, false);
runtime_top_k_buf_ =
reinterpret_cast<uint*>(allocator_->reMalloc(runtime_top_k_buf_, sizeof(uint) * batch_size, false));
runtime_top_p_buf_ =
reinterpret_cast<float*>(allocator_->reMalloc(runtime_top_p_buf_, sizeof(float) * batch_size, false));
is_allocate_buffer_ = true;
}
template<typename T>
void TopKSamplingLayer<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&sampling_workspace_));
allocator_->free((void**)(&runtime_top_k_buf_));
allocator_->free((void**)(&runtime_top_p_buf_));
}
BaseSamplingLayer<T>::freeBuffer();
is_allocate_buffer_ = false;
}
template<typename T>
void TopKSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
{
    // Set up the runtime top-k and top-p arguments
    // (see the illustrative sketch after this function).
    //
    // runtime_args:
    //     runtime_top_k [1] or [batch_size] on cpu, optional, uint.
    //     runtime_top_p [1] or [batch_size] on cpu, optional, float.
    //     temperature [1] or [batch_size] on cpu, optional, float.
    //     repetition_penalty [1] or [batch_size] on cpu, optional, float.
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
BaseSamplingLayer<T>::setup(batch_size, beam_width, runtime_args);
uint tmp_top_k = 0;
const Tensor runtime_top_k = runtime_args->isExist("runtime_top_k") ?
runtime_args->at("runtime_top_k") :
Tensor(MEMORY_CPU, TYPE_UINT32, {1}, &tmp_top_k);
const Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor();
const size_t runtime_top_k_size = runtime_top_k.size();
const size_t runtime_top_p_size = runtime_top_p.size();
uint top_k = runtime_top_k.max<uint>();
float top_p = runtime_top_p_size == 0 ? 0.0f : runtime_top_p.getVal<float>();
if (runtime_top_k_size > 1) {
FT_CHECK_WITH_INFO(
runtime_top_k.size() == batch_size,
fmtstr("runtime_top_k.size() (%d) == batch_size (%d) is not satisfied!", runtime_top_k.size(), batch_size));
cudaAutoCpy(runtime_top_k_buf_, runtime_top_k.getPtr<uint>(), batch_size, stream_);
}
if (runtime_top_p_size > 1) {
FT_CHECK_WITH_INFO(
runtime_top_p.size() == batch_size,
fmtstr("runtime_top_p.size() (%d) == batch_size (%d) is not satisfied!", runtime_top_p.size(), batch_size));
cudaAutoCpy(runtime_top_p_buf_, runtime_top_p.getPtr<float>(), batch_size, stream_);
}
dim3 block(std::min((int)batch_size, 256));
dim3 grid(div_up((int)batch_size, (int)block.x));
// support top_k up to 1024.
setup_topk_runtime_args<1024><<<grid, block, 0, stream_>>>(batch_size,
top_k,
runtime_top_k_buf_,
runtime_top_k_size,
top_p,
runtime_top_p_buf_,
runtime_top_p_size,
skip_decode_buf_);
cudaAutoCpy(skip_decode_, skip_decode_buf_, batch_size, stream_);
uint* runtime_top_ks = new uint[batch_size];
cudaAutoCpy(runtime_top_ks, runtime_top_k_buf_, batch_size, stream_);
runtime_max_top_k_ = static_cast<int>(*std::max_element(runtime_top_ks, runtime_top_ks + batch_size));
delete[] runtime_top_ks;
}
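// Illustrative sketch (not part of the original sources): how a caller might populate
// `runtime_args` before calling setup(). It assumes TensorMap::insert(name, Tensor), as used
// elsewhere in FasterTransformer, and that the top_k/top_p values live on the CPU; `layer` and
// `batch_size` are hypothetical.
//
//     uint  top_k = 4;     // broadcast to the whole batch because the shape is {1}
//     float top_p = 0.9f;  // combined top-k / top-p sampling
//     TensorMap runtime_args;
//     runtime_args.insert("runtime_top_k", Tensor(MEMORY_CPU, TYPE_UINT32, {1}, &top_k));
//     runtime_args.insert("runtime_top_p", Tensor(MEMORY_CPU, TYPE_FP32, {1}, &top_p));
//     layer.setup(batch_size, 1 /* beam_width */, &runtime_args);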
template<typename T>
void TopKSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* input_tensors)
{
// input_tensors:
// logits [local_batch_size, vocab_size_padded]
// embedding_bias [vocab_size_padded], optional
// step [1] on cpu
// max_input_length [1] on cpu
// input_lengths [local_batch_size], optional
// ite [1] on cpu
// output_tensors:
// output_ids [max_seq_len, batch_size]
// finished [local_batch_size], optional
// sequence_length [local_batch_size], optional
    //      cum_log_probs [batch_size], must be float*, optional
    //          The cumulative log probability of the generated tokens.
    //      output_log_probs [local_batch_size], must be float*, optional
    //          The log probabilities at the current step.
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() >= 4);
FT_CHECK(output_tensors->size() >= 1);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int local_batch_size = input_tensors->at("logits").shape[0];
const int ite = input_tensors->at("ite").getVal<int>();
const int step = input_tensors->at("step").getVal<int>();
    // If any sequence in the batch is skipped, the logits have already been copied to and
    // processed in runtime_logits_buf_.
T* logits = !skip_any_ ? input_tensors->at("logits").getPtr<T>() : runtime_logits_buf_;
invokeAddBiasEndMask(logits,
(T*)(nullptr),
input_tensors->at("end_id").getPtr<const int>(),
output_tensors->at("finished", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<bool>(),
local_batch_size,
vocab_size_,
vocab_size_padded_,
stream_);
sync_check_cuda_error();
float* cum_log_probs =
output_tensors->isExist("cum_log_probs") ? output_tensors->at("cum_log_probs").getPtr<float>() : nullptr;
float* output_log_probs =
output_tensors->isExist("output_log_probs") ? output_tensors->at("output_log_probs").getPtr<float>() : nullptr;
if (cum_log_probs != nullptr || output_log_probs != nullptr) {
invokeAddBiasSoftMax(
logits,
(T*)(nullptr),
input_tensors->at("end_id").getPtr<int>(),
output_tensors->at("finished", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<bool>(),
local_batch_size,
vocab_size_padded_,
vocab_size_,
stream_);
sync_check_cuda_error();
}
invokeBatchTopKSampling(
sampling_workspace_,
sampling_workspace_size_,
logits,
output_tensors->at("output_ids").getPtrWithOffset<int>(step * batch_size + ite * local_batch_size),
output_tensors->at("sequence_length", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<int>(),
output_tensors->at("finished", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<bool>(),
cum_log_probs,
output_log_probs,
curandstate_buf_ + ite * local_batch_size,
        (int)runtime_max_top_k_,  // unused because runtime_top_k_buf_ is never nullptr; kept for the legacy API.
        (int*)(runtime_top_k_buf_ + ite * local_batch_size),
        1.0f,  // unused because runtime_top_p_buf_ is never nullptr; kept for the legacy API.
        runtime_top_p_buf_ + ite * local_batch_size,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<int>(),
stream_,
local_batch_size,
skip_decode_buf_ + ite * local_batch_size);
sync_check_cuda_error();
}
template<typename T>
TopKSamplingLayer<T>::TopKSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
size_t top_k,
unsigned long long random_seed,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
BaseSamplingLayer<T>(max_batch_size,
vocab_size,
vocab_size_padded,
end_id,
top_k,
0.0f,
random_seed,
temperature,
len_penalty,
repetition_penalty,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward,
nullptr)
{
}
template<typename T>
TopKSamplingLayer<T>::TopKSamplingLayer(TopKSamplingLayer<T> const& top_k_sampling_layer):
BaseSamplingLayer<T>(top_k_sampling_layer)
{
}
template<typename T>
TopKSamplingLayer<T>::~TopKSamplingLayer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
freeBuffer();
}
template class TopKSamplingLayer<float>;
template class TopKSamplingLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.h"
#include "src/fastertransformer/utils/memory_utils.h"
namespace fastertransformer {
template<typename T>
class TopKSamplingLayer: public BaseSamplingLayer<T> {
private:
void runSampling(TensorMap* output_tensors, TensorMap* input_tensors) override;
void freeBuffer() override;
void allocateBuffer() override;
void allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p) override;
uint runtime_max_top_k_ = 1;
uint* runtime_top_k_buf_ = nullptr;
float* runtime_top_p_buf_ = nullptr;
using BaseSamplingLayer<T>::vocab_size_;
using BaseSamplingLayer<T>::vocab_size_padded_;
using BaseSamplingLayer<T>::sampling_workspace_size_;
using BaseSamplingLayer<T>::sampling_workspace_;
using BaseSamplingLayer<T>::curandstate_buf_;
using BaseSamplingLayer<T>::random_seeds_buf_;
using BaseSamplingLayer<T>::skip_decode_buf_;
using BaseSamplingLayer<T>::skip_decode_;
using BaseSamplingLayer<T>::skip_any_;
using BaseSamplingLayer<T>::runtime_logits_buf_;
using BaseSamplingLayer<T>::stream_;
using BaseSamplingLayer<T>::allocator_;
using BaseSamplingLayer<T>::is_allocate_buffer_;
protected:
public:
TopKSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
size_t top_k,
unsigned long long random_seed,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
TopKSamplingLayer(TopKSamplingLayer<T> const& top_k_sampling_layer);
~TopKSamplingLayer();
void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include <float.h>
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
#include "src/fastertransformer/kernels/sampling_topp_kernels.h"
#include "src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/fastertransformer/utils/memory_utils.h"
namespace fastertransformer {
static __global__ void set_topp_runtime_args(int batch_size,
uint top_k,
uint* top_ks,
int top_ks_size,
float top_p,
float* top_ps,
int top_ps_size,
bool* skip_decode,
float* initial_top_p_buf,
float* top_p_decay_buf,
const float* top_p_decay,
float* top_p_min_buf,
const float* top_p_min,
int32_t* top_p_reset_ids_buf,
const uint32_t* top_p_reset_ids)
{
    /**
     * @brief Set up the runtime arguments for top-p sampling: broadcast top_p to top_ps and
     *        top_k to top_ks, and copy top_p_decay/top_p_min/top_p_reset_ids into the
     *        internal buffers.
     *
     * \param batch_size          number of sequences processed by this layer
     * \param top_k               default top_k, used when top_ks_size == 1
     * \param top_ks              [batch_size], per-sequence top_k values
     * \param top_ks_size         1 or batch_size
     * \param top_p               default top_p, used when top_ps_size == 1
     * \param top_ps              [batch_size], per-sequence top_p values
     * \param top_ps_size         1 or batch_size
     * \param skip_decode         [batch_size], set to true for sequences handled by the top-k layer
     * \param initial_top_p_buf   [batch_size], initial top_p values, used when the decay is reset
     * \param top_p_decay_buf     [batch_size], internal copy of top_p_decay
     * \param top_p_decay         [batch_size], optional, must be in (0.0, 1.0]
     * \param top_p_min_buf       [batch_size], internal copy of top_p_min
     * \param top_p_min           [batch_size], optional, must be in (0.0, 1.0]
     * \param top_p_reset_ids_buf [batch_size], internal copy of top_p_reset_ids
     * \param top_p_reset_ids     [batch_size], optional, token ids that reset top_p to its initial value
     */
int index = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = index; i < batch_size; i += gridDim.x * blockDim.x) {
uint k = top_ks_size > 1 ? top_ks[i] : top_k;
float p = top_ps_size > 1 ? top_ps[i] : top_p;
if (k == 0 && p == 0.0f) {
            // FT's top-p implementation does not support top_p = 0.0f, but that setting is
            // equivalent to greedy search, so we fall back to top_k = 1 instead.
k = 1;
}
top_ks[i] = k;
// Clip p value if it is out of range. range = [0.0, 1.0].
top_ps[i] = p < 0.0f ? 0.0f : (p > 1.0f ? 1.0f : p);
if (p < 0.0f || p > 1.0f) {
printf("[WARNING] topp (%f) is out of range ([0.0, 1.0f]) for token %d"
" clip to closest number %f.\n",
p,
i,
top_ps[i]);
}
skip_decode[i] = k > 0;
initial_top_p_buf[i] = top_ps[i];
top_p_decay_buf[i] = top_p_decay == nullptr ? 1.0f : top_p_decay[i];
if (top_p_decay_buf[i] > 1.0f || top_p_decay_buf[i] <= 0.0f) {
printf("[WARNING] top_p_decay_buf (%f) is out of range ([0.0, 1.0f]) for token %d,"
" change to 1.0f.\n",
top_p_decay_buf[i],
i);
top_p_decay_buf[i] = 1.0f;
}
        top_p_min_buf[i] = top_p_min == nullptr ? 1e-6f : top_p_min[i];  // prevent top_p from decaying to 0.0
        if (top_p_min_buf[i] > 1.0f || top_p_min_buf[i] <= 0.0f) {
            printf("[WARNING] top_p_min_buf (%f) is out of range (0.0, 1.0] for token %d;"
                   " resetting to 0.5f.\n",
top_p_min_buf[i],
i);
top_p_min_buf[i] = 0.5f;
}
top_p_reset_ids_buf[i] = (int32_t)(top_p_reset_ids == nullptr ? -1 : top_p_reset_ids[i]);
}
}
template<typename T>
void TopPSamplingLayer<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void TopPSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
BaseSamplingLayer<T>::allocateBuffer(batch_size, top_k, top_p);
invokeTopPSampling<T>(nullptr, // workspace
sampling_workspace_size_,
cub_temp_storage_size_,
nullptr, // output_ids
nullptr, // sequence_length
nullptr, // finished_buffer
nullptr, // cum_log_probs
nullptr, // output_log_probs
nullptr, // log_probs
topp_id_vals_buf_,
topp_offset_buf_,
begin_topp_offset_buf_,
curandstate_buf_,
batch_size,
vocab_size_padded_,
nullptr,
top_p.size() > 0 ? top_p.max<float>() : 0.0f,
stream_,
cuda_device_prop_,
skip_decode_buf_);
sampling_workspace_ = allocator_->reMalloc(sampling_workspace_, sampling_workspace_size_, true);
runtime_top_k_buf_ =
reinterpret_cast<uint*>(allocator_->reMalloc(runtime_top_k_buf_, sizeof(uint) * batch_size, false));
runtime_top_p_buf_ =
reinterpret_cast<float*>(allocator_->reMalloc(runtime_top_p_buf_, sizeof(float) * batch_size, false));
initial_top_p_buf_ =
reinterpret_cast<float*>(allocator_->reMalloc(initial_top_p_buf_, sizeof(float) * batch_size, false));
top_p_decay_buf_ =
reinterpret_cast<float*>(allocator_->reMalloc(top_p_decay_buf_, sizeof(float) * batch_size, false));
top_p_min_buf_ = reinterpret_cast<float*>(allocator_->reMalloc(top_p_min_buf_, sizeof(float) * batch_size, false));
top_p_reset_ids_buf_ =
reinterpret_cast<int32_t*>(allocator_->reMalloc(top_p_reset_ids_buf_, sizeof(int32_t) * batch_size, false));
topp_id_vals_buf_ = reinterpret_cast<int*>(
allocator_->reMalloc(topp_id_vals_buf_, sizeof(int) * batch_size * vocab_size_padded_, false));
topp_offset_buf_ =
reinterpret_cast<int*>(allocator_->reMalloc(topp_offset_buf_, sizeof(int) * (batch_size + 1), false));
begin_topp_offset_buf_ =
reinterpret_cast<int*>(allocator_->reMalloc(begin_topp_offset_buf_, sizeof(int) * (batch_size + 1), false));
is_allocate_buffer_ = true;
}
template<typename T>
void TopPSamplingLayer<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&sampling_workspace_));
allocator_->free((void**)(&topp_id_vals_buf_));
allocator_->free((void**)(&topp_offset_buf_));
allocator_->free((void**)(&begin_topp_offset_buf_));
allocator_->free((void**)(&runtime_top_k_buf_));
allocator_->free((void**)(&runtime_top_p_buf_));
allocator_->free((void**)(&initial_top_p_buf_));
allocator_->free((void**)(&top_p_decay_buf_));
allocator_->free((void**)(&top_p_min_buf_));
allocator_->free((void**)(&top_p_reset_ids_buf_));
}
BaseSamplingLayer<T>::freeBuffer();
is_allocate_buffer_ = false;
}
template<typename T>
void TopPSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
{
    /**
     * @brief Set up the sampling layer for the given runtime arguments
     *        (see the illustrative sketch after this function).
     *
     * runtime_args:
     * \param runtime_top_k [1] or [batch_size] on cpu, optional, uint
     * \param runtime_top_p [1] or [batch_size] on cpu, optional, float
     * \param temperature [1] or [batch_size] on cpu, optional, float
     * \param repetition_penalty [1] or [batch_size] on cpu, optional, float
     * \param top_p_decay [batch_size] on gpu, float, optional
     * \param top_p_min [batch_size] on gpu, float, optional
     * \param top_p_reset_ids [batch_size] on gpu, uint32, optional
     */
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
BaseSamplingLayer<T>::setup(batch_size, beam_width, runtime_args);
const Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor();
const size_t runtime_top_p_size = runtime_top_p.size();
if (runtime_top_p_size == 0) {
std::fill_n(skip_decode_, batch_size, true);
return;
}
uint tmp_top_k = 0;
const Tensor runtime_top_k = runtime_args->isExist("runtime_top_k") ?
runtime_args->at("runtime_top_k") :
Tensor(MEMORY_CPU, TYPE_UINT32, {1}, &tmp_top_k);
const size_t runtime_top_k_size = runtime_top_k.size();
uint top_k = runtime_top_k.getVal<uint>();
float top_p = runtime_top_p.getVal<float>();
if (runtime_top_k_size > 1) {
FT_CHECK(runtime_top_k.size() == batch_size);
cudaH2Dcpy(runtime_top_k_buf_, runtime_top_k.getPtr<uint>(), batch_size);
}
if (runtime_top_p_size > 1) {
FT_CHECK(runtime_top_p.size() == batch_size);
cudaH2Dcpy(runtime_top_p_buf_, runtime_top_p.getPtr<float>(), batch_size);
}
dim3 block(std::min((int)batch_size, 256));
dim3 grid(div_up((int)batch_size, (int)block.x));
const float* top_p_decay = runtime_args->getPtr<float>("top_p_decay", nullptr);
const float* top_p_min = runtime_args->getPtr<float>("top_p_min", nullptr);
const uint32_t* top_p_reset_ids = runtime_args->getPtr<uint32_t>("top_p_reset_ids", nullptr);
set_topp_runtime_args<<<grid, block, 0, stream_>>>(batch_size,
top_k,
runtime_top_k_buf_,
runtime_top_k_size,
top_p,
runtime_top_p_buf_,
runtime_top_p_size,
skip_decode_buf_,
initial_top_p_buf_,
top_p_decay_buf_,
top_p_decay,
top_p_min_buf_,
top_p_min,
top_p_reset_ids_buf_,
top_p_reset_ids);
sync_check_cuda_error();
cudaAutoCpy(skip_decode_, skip_decode_buf_, batch_size, stream_);
float* runtime_top_ps = new float[batch_size];
cudaAutoCpy(runtime_top_ps, runtime_top_p_buf_, batch_size, stream_);
runtime_max_top_p_ = *std::max_element(runtime_top_ps, runtime_top_ps + batch_size);
delete[] runtime_top_ps;
}
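// Illustrative sketch (not part of the original sources): how a caller might enable pure top-p
// sampling together with the optional decay parameters. It assumes TensorMap::insert(name, Tensor);
// `layer`, `batch_size`, `d_decay`, `d_min` and `d_reset_ids` are hypothetical, the latter three
// being device buffers of length batch_size.
//
//     float top_p = 0.8f;
//     TensorMap runtime_args;
//     runtime_args.insert("runtime_top_p", Tensor(MEMORY_CPU, TYPE_FP32, {1}, &top_p));
//     runtime_args.insert("top_p_decay", Tensor(MEMORY_GPU, TYPE_FP32, {batch_size}, d_decay));
//     runtime_args.insert("top_p_min", Tensor(MEMORY_GPU, TYPE_FP32, {batch_size}, d_min));
//     runtime_args.insert("top_p_reset_ids", Tensor(MEMORY_GPU, TYPE_UINT32, {batch_size}, d_reset_ids));
//     layer.setup(batch_size, 1 /* beam_width */, &runtime_args);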
template<typename T>
void TopPSamplingLayer<T>::runSampling(TensorMap* output_tensors, TensorMap* input_tensors)
{
    /**
     * input_tensors:
     * \param logits [local_batch_size, vocab_size_padded]
     * \param embedding_bias [vocab_size_padded], optional
     * \param step [1] on cpu
     * \param max_input_length [1] on cpu
     * \param input_lengths [local_batch_size], optional
     * \param ite [1] on cpu
     *
     * output_tensors:
     * \param output_ids [max_seq_len, batch_size]
     * \param finished [local_batch_size], optional
     * \param sequence_length [local_batch_size], optional
     * \param cum_log_probs [batch_size], must be float*, optional
     *        The cumulative log probability of the generated tokens.
     * \param output_log_probs [local_batch_size], must be float*, optional
     *        The log probabilities at the current step.
     */
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() >= 4);
FT_CHECK(output_tensors->size() >= 1);
const int batch_size = output_tensors->at("output_ids").shape[1];
const int local_batch_size = input_tensors->at("logits").shape[0];
const int step = input_tensors->at("step").getVal<int>();
const int ite = input_tensors->at("ite").getVal<int>();
    // If any sequence in the batch is skipped, the logits have already been copied to and
    // processed in runtime_logits_buf_.
T* logits = !skip_any_ ? input_tensors->at("logits").getPtr<T>() : runtime_logits_buf_;
invokeTopPInitialize(
topp_id_vals_buf_, topp_offset_buf_, begin_topp_offset_buf_, local_batch_size, vocab_size_padded_, stream_);
sync_check_cuda_error();
invokeAddBiasSoftMax(logits,
(T*)(nullptr),
input_tensors->at("end_id").getPtr<int>(),
output_tensors->at("finished", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<bool>(),
local_batch_size,
vocab_size_padded_,
vocab_size_,
stream_);
sync_check_cuda_error();
float* cum_log_probs =
output_tensors->isExist("cum_log_probs") ? output_tensors->at("cum_log_probs").getPtr<float>() : nullptr;
float* output_log_probs =
output_tensors->isExist("output_log_probs") ? output_tensors->at("output_log_probs").getPtr<float>() : nullptr;
invokeBatchTopPSampling<T>(
sampling_workspace_,
sampling_workspace_size_,
cub_temp_storage_size_,
output_tensors->at("output_ids").getPtrWithOffset<int>(step * batch_size + ite * local_batch_size),
output_tensors->at("sequence_length", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<int>(),
output_tensors->at("finished", Tensor{MEMORY_GPU, TYPE_INVALID, {}, nullptr}).getPtr<bool>(),
cum_log_probs,
output_log_probs,
logits,
topp_id_vals_buf_,
topp_offset_buf_,
begin_topp_offset_buf_,
curandstate_buf_ + ite * local_batch_size,
local_batch_size,
vocab_size_padded_,
input_tensors->at("end_id").getPtr<int>(),
runtime_max_top_p_,
runtime_top_p_buf_ + ite * local_batch_size,
stream_,
cuda_device_prop_,
skip_decode_buf_ + ite * local_batch_size);
sync_check_cuda_error();
invokeComputeToppDecay(
runtime_top_p_buf_ + ite * local_batch_size,
initial_top_p_buf_ + ite * local_batch_size,
output_tensors->getPtrWithOffset<int>("output_ids", step * batch_size + ite * local_batch_size),
top_p_decay_buf_ + ite * local_batch_size,
top_p_min_buf_ + ite * local_batch_size,
top_p_reset_ids_buf_ + ite * local_batch_size,
local_batch_size,
stream_);
sync_check_cuda_error();
FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
}
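// Note on the decay step above (a descriptive note, not part of the original file): after each
// sampling step, invokeComputeToppDecay appears to update the per-sequence top_p roughly as
//
//     top_p = max(top_p * top_p_decay, top_p_min);
//     if (generated_token == top_p_reset_ids) top_p = initial_top_p;
//
// which lets top_p shrink over consecutive steps and snap back to its initial value whenever a
// designated reset token is produced.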
template<typename T>
TopPSamplingLayer<T>::TopPSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float top_p,
unsigned long long random_seed,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop):
BaseSamplingLayer<T>(max_batch_size,
vocab_size,
vocab_size_padded,
end_id,
0,
top_p,
random_seed,
temperature,
len_penalty,
repetition_penalty,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward,
cuda_device_prop)
{
}
template<typename T>
TopPSamplingLayer<T>::TopPSamplingLayer(TopPSamplingLayer<T> const& top_p_sampling_layer):
BaseSamplingLayer<T>(top_p_sampling_layer)
{
}
template<typename T>
TopPSamplingLayer<T>::~TopPSamplingLayer()
{
freeBuffer();
}
template class TopPSamplingLayer<float>;
template class TopPSamplingLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.h"
namespace fastertransformer {
template<typename T>
class TopPSamplingLayer: public BaseSamplingLayer<T> {
private:
void runSampling(TensorMap* output_tensors, TensorMap* input_tensors) override;
void allocateBuffer() override;
void allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p) override;
void freeBuffer() override;
uint* runtime_top_k_buf_ = nullptr;
float* runtime_top_p_buf_ = nullptr;
float runtime_max_top_p_;
float* initial_top_p_buf_ = nullptr;
float* top_p_decay_buf_ = nullptr;
float* top_p_min_buf_ = nullptr;
int32_t* top_p_reset_ids_buf_ = nullptr;
int* topp_id_vals_buf_ = nullptr;
int* topp_offset_buf_ = nullptr;
int* begin_topp_offset_buf_ = nullptr;
size_t cub_temp_storage_size_;
using BaseSamplingLayer<T>::vocab_size_;
using BaseSamplingLayer<T>::vocab_size_padded_;
using BaseSamplingLayer<T>::sampling_workspace_size_;
using BaseSamplingLayer<T>::sampling_workspace_;
using BaseSamplingLayer<T>::curandstate_buf_;
using BaseSamplingLayer<T>::random_seeds_buf_;
using BaseSamplingLayer<T>::skip_decode_buf_;
using BaseSamplingLayer<T>::skip_decode_;
using BaseSamplingLayer<T>::skip_any_;
using BaseSamplingLayer<T>::runtime_logits_buf_;
using BaseSamplingLayer<T>::stream_;
using BaseSamplingLayer<T>::allocator_;
using BaseSamplingLayer<T>::is_allocate_buffer_;
using BaseSamplingLayer<T>::cuda_device_prop_;
protected:
public:
TopPSamplingLayer(size_t max_batch_size,
size_t vocab_size,
size_t vocab_size_padded,
int end_id,
float top_p,
unsigned long long random_seed,
float temperature,
float len_penalty,
float repetition_penalty,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop);
TopPSamplingLayer(TopPSamplingLayer<T> const& top_p_sampling_layer);
~TopPSamplingLayer();
void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;
};
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string>
#include <vector>
#pragma once
namespace fastertransformer {
template<typename T>
struct FtWeight {
public:
std::string name_;
std::vector<size_t> shape_;
size_t size_ = 0;
T* ptr_ = nullptr;
FtWeight() {}
FtWeight(const std::string name, const std::vector<size_t> shape, T* ptr): name_(name), shape_(shape), ptr_(ptr)
{
size_ = 1;
for (uint i = 0; i < shape_.size(); i++) {
size_ *= shape_[i];
}
}
~FtWeight()
{
size_ = 0;
ptr_ = nullptr;
}
};
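// Illustrative usage (not part of the original file): wrap an existing buffer together with its
// shape; size_ is derived from the product of the shape dimensions.
//
//     std::vector<float> data(16);
//     FtWeight<float> w("ffn.bias", {4, 4}, data.data());  // w.size_ == 16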
} // namespace fastertransformer
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
add_subdirectory(llama)
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/utils/logger.h"
#include <pthread.h>
namespace fastertransformer {
class Barrier {
public:
Barrier(unsigned count)
{
FT_LOG_INFO("Barrier(%d)", (int)count);
pthread_barrier_init(&barrier_, nullptr, count);
}
Barrier(const Barrier&) = delete;
Barrier& operator=(const Barrier&) = delete;
Barrier(Barrier&&) noexcept = delete;
Barrier& operator=(Barrier&&) noexcept = delete;
void wait()
{
pthread_barrier_wait(&barrier_);
}
~Barrier()
{
pthread_barrier_destroy(&barrier_);
}
private:
pthread_barrier_t barrier_{};
};
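// Illustrative usage (not part of the original file): release two threads once both have reached
// the barrier, e.g. together with std::thread.
//
//     Barrier barrier(2);
//     std::thread worker([&] { /* ... */ barrier.wait(); });
//     barrier.wait();  // returns only after the worker has also called wait()
//     worker.join();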
} // namespace fastertransformer
# Copyright (c) OpenMMLab. All rights reserved.
cmake_minimum_required(VERSION 3.8)
add_subdirectory(fused_multi_head_attention)
add_library(Llama STATIC
LlamaV2.cc
LlamaBatch.cc
LlamaCacheManager.cc
LlamaContextDecoder.cc
LlamaContextAttentionLayer.cc
LlamaDecoderSelfAttentionLayer.cc
LlamaDecoder.cc
LlamaWeight.cc
LlamaDecoderLayerWeight.cc
LlamaFfnLayer.cc
llama_kernels.cu
llama_decoder_kernels.cu
llama_utils.cu)
set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(Llama PUBLIC -lcudart
cublasMMWrapper
DynamicDecodeLayer
BaseBeamSearchLayer
activation_kernels
decoder_masked_multihead_attention
bert_preprocess_kernels
decoding_kernels
unfused_attention_kernels
custom_ar_kernels
custom_ar_comm
gpt_kernels
tensor
memory_utils
nccl_utils
cuda_utils
logger
llama_fmha)