/*
 * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
 * Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.h"
#include "src/fastertransformer/kernels/sampling_penalty_kernels.h"
#include "src/fastertransformer/kernels/sampling_topk_kernels.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"

#include <algorithm>

namespace fastertransformer {

template<typename T>
void BaseSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tensor top_p)
{
    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
    curandstate_buf_ = reinterpret_cast<curandState_t*>(
        allocator_->reMalloc(curandstate_buf_, sizeof(curandState_t) * batch_size, false));
    random_seeds_buf_ = reinterpret_cast<unsigned long long*>(
        allocator_->reMalloc(random_seeds_buf_, sizeof(unsigned long long) * batch_size, false));
    temperature_buf_ =
        reinterpret_cast<float*>(allocator_->reMalloc(temperature_buf_, sizeof(float) * batch_size, false));
    repetition_penalty_buf_ =
        reinterpret_cast<float*>(allocator_->reMalloc(repetition_penalty_buf_, sizeof(float) * batch_size, false));
    min_lengths_buf_ =
        reinterpret_cast<int*>(allocator_->reMalloc(min_lengths_buf_, sizeof(int) * batch_size, false));
    runtime_logits_buf_ = reinterpret_cast<T*>(
        allocator_->reMalloc(runtime_logits_buf_, sizeof(T) * batch_size * vocab_size_padded_, false));
    skip_decode_buf_ =
        reinterpret_cast<bool*>(allocator_->reMalloc(skip_decode_buf_, sizeof(bool) * batch_size, false));

    // host buffers.
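    // These host-side arrays mirror the device buffers above; forward() inspects them on the CPU
    // (see the ALL_OF/any_of checks) to decide whether a penalty kernel, or the whole sampling step,
    // can be skipped without reading anything back from the GPU.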
    temperature_        = new float[batch_size];
    repetition_penalty_ = new float[batch_size];
    min_lengths_        = new int[batch_size];
    skip_decode_        = new bool[batch_size];

    is_allocate_buffer_ = true;
}

template<typename T>
void BaseSamplingLayer<T>::freeBuffer()
{
    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
    if (is_allocate_buffer_) {
        allocator_->free((void**)(&curandstate_buf_));
        allocator_->free((void**)(&random_seeds_buf_));
        allocator_->free((void**)(&temperature_buf_));
        allocator_->free((void**)(&repetition_penalty_buf_));
        allocator_->free((void**)(&min_lengths_buf_));
        allocator_->free((void**)(&runtime_logits_buf_));
        allocator_->free((void**)(&skip_decode_buf_));
        delete[] temperature_;
        delete[] repetition_penalty_;
        delete[] min_lengths_;
        delete[] skip_decode_;
        is_allocate_buffer_ = false;
    }
}

template<typename T>
BaseSamplingLayer<T>::BaseSamplingLayer(size_t             max_batch_size,
                                        size_t             vocab_size,
                                        size_t             vocab_size_padded,
                                        int                end_id,
                                        size_t             top_k,
                                        float              top_p,
                                        unsigned long long random_seed,
                                        float              temperature,
                                        float              len_penalty,
                                        float              repetition_penalty,
                                        cudaStream_t       stream,
                                        cublasMMWrapper*   cublas_wrapper,
                                        IAllocator*        allocator,
                                        bool               is_free_buffer_after_forward,
                                        cudaDeviceProp*    cuda_device_prop):
    DynamicDecodeBaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop),
    vocab_size_(vocab_size),
    vocab_size_padded_(vocab_size_padded)
{
}

template<typename T>
BaseSamplingLayer<T>::BaseSamplingLayer(BaseSamplingLayer const& sampling_layer):
    DynamicDecodeBaseLayer(sampling_layer),
    vocab_size_(sampling_layer.vocab_size_),
    vocab_size_padded_(sampling_layer.vocab_size_padded_),
    sampling_workspace_size_(sampling_layer.sampling_workspace_size_)
{
}

template<typename T>
BaseSamplingLayer<T>::~BaseSamplingLayer()
{
}

template<typename T>
void BaseSamplingLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
{
    // Set up the sampling layer for the given runtime arguments.
    //
    // runtime_args:
    //     runtime_top_k [1] or [batch_size] on cpu, optional
    //     runtime_top_p [1] or [batch_size] on cpu, optional
    //     temperature [1] or [batch_size] on cpu, optional
    //     repetition_penalty [1] or [batch_size] on cpu, optional
    //     presence_penalty [1] or [batch_size] on cpu, optional
    //         (repetition_penalty and presence_penalty are mutually exclusive)
    //     min_length [1] or [batch_size] on cpu, optional

    FT_LOG_DEBUG(__PRETTY_FUNCTION__);
    Tensor runtime_top_k = runtime_args->isExist("runtime_top_k") ? runtime_args->at("runtime_top_k") : Tensor();
    Tensor runtime_top_p = runtime_args->isExist("runtime_top_p") ? runtime_args->at("runtime_top_p") : Tensor();
    allocateBuffer(batch_size, runtime_top_k, runtime_top_p);

    // If the runtime arguments contain a single random seed, use it to initialize the random table of all
    // sentences. If they contain [batch_size] random seeds, initialize each sentence's random table with its
    // own seed. If no random seed is given, initialize the random table of all sentences with seed 0.
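    //
    // Illustrative sketch only (not upstream documentation): a caller could request per-sentence
    // seeds by inserting a [batch_size] tensor, e.g.
    //     runtime_args->insert("random_seed", Tensor(MEMORY_CPU, TYPE_UINT64, {batch_size}, seeds_ptr));
    // where seeds_ptr is a hypothetical host array of unsigned long long; this takes the batched
    // branch below, while a {1}-shaped tensor seeds every sentence identically.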
    if (runtime_args->isExist("random_seed")) {
        Tensor random_seeds = runtime_args->at("random_seed");
        FT_CHECK_WITH_INFO(random_seeds.shape.size() == 1
                               && (random_seeds.size() == 1 || random_seeds.size() == batch_size),
                           fmtstr("random_seeds must be of shape [1] or [batch_size(%ld)], got random_seeds.shape=%s",
                                  batch_size,
                                  vec2str(random_seeds.shape).c_str()));
        if (random_seeds.size() == 1) {
            invokeCurandInitialize(curandstate_buf_, batch_size, random_seeds.getVal<unsigned long long>(), stream_);
            sync_check_cuda_error();
        }
        else {
            unsigned long long* random_seed_ptr = random_seeds.getPtr<unsigned long long>();
            cudaAutoCpy(random_seeds_buf_, random_seed_ptr, batch_size, stream_);
            invokeCurandBatchInitialize(curandstate_buf_, batch_size, random_seeds_buf_, stream_);
            sync_check_cuda_error();
        }
    }
    else {
        // Initialize curand states using the default seed 0.
        invokeCurandInitialize(curandstate_buf_, batch_size, 0, stream_);
    }

    // Set up penalties.
    const float default_temperature = 1.0f;
    Tensor      temperature         = runtime_args->isExist("temperature") ?
                                          runtime_args->at("temperature") :
                                          Tensor(MEMORY_CPU, TYPE_FP32, {1}, &default_temperature);
    if (temperature.size() == 1) {
        float tp = temperature.getVal<float>();
        deviceFill(temperature_buf_, batch_size, tp, stream_);
        std::fill_n(temperature_, batch_size, tp);
    }
    else {
        cudaAutoCpy(temperature_buf_, temperature.getPtr<float>(), batch_size, stream_);
        std::copy_n(temperature.getPtr<float>(), batch_size, temperature_);
    }

    if (runtime_args->isExist("repetition_penalty") || runtime_args->isExist("presence_penalty")) {
        FT_CHECK_WITH_INFO(
            !(runtime_args->isExist("repetition_penalty") && runtime_args->isExist("presence_penalty")),
            "Found ambiguous parameters repetition_penalty and presence_penalty which are mutually exclusive. "
            "Please provide one of repetition_penalty or presence_penalty.");
        repetition_penalty_type_  = runtime_args->isExist("repetition_penalty") ?
                                        RepetitionPenaltyType::Multiplicative :
                                        RepetitionPenaltyType::Additive;
        Tensor repetition_penalty = repetition_penalty_type_ == RepetitionPenaltyType::Multiplicative ?
                                        runtime_args->at("repetition_penalty") :
                                        runtime_args->at("presence_penalty");
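        // Informational note: Multiplicative corresponds to the classic repetition penalty (logits of
        // previously generated tokens are rescaled by the penalty; 1.0 is neutral), while Additive
        // corresponds to a presence penalty (the value is subtracted from those logits; 0.0 is neutral).
        // See getDefaultPenaltyValue() and invokeBatchApplyRepetitionPenalty() for the exact semantics.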
runtime_args->at("repetition_penalty") : runtime_args->at("presence_penalty"); if (repetition_penalty.size() == 1) { float rp = repetition_penalty.getVal(); deviceFill(repetition_penalty_buf_, batch_size, rp, stream_); std::fill_n(repetition_penalty_, batch_size, rp); } else { cudaAutoCpy(repetition_penalty_buf_, repetition_penalty.getPtr(), batch_size, stream_); std::copy_n(repetition_penalty.getPtr(), batch_size, repetition_penalty_); } } else { repetition_penalty_type_ = RepetitionPenaltyType::None; } const int default_min_length = 0; Tensor min_lengths = runtime_args->at("min_length", Tensor(MEMORY_CPU, TYPE_INT32, {1}, &default_min_length)); if (min_lengths.size() == 1) { int minlen = min_lengths.getVal(); deviceFill(min_lengths_buf_, batch_size, minlen, stream_); std::fill_n(min_lengths_, batch_size, minlen); } else { cudaAutoCpy(min_lengths_buf_, min_lengths.getPtr(), batch_size, stream_); std::copy_n(min_lengths.getPtr(), batch_size, min_lengths_); } } template void BaseSamplingLayer::forward(std::vector* output_tensors, const std::vector* input_tensors) { // input_tensors: // logits [local_batch_size, vocab_size_padded] // embedding_bias [vocab_size_padded] // step [1] on cpu // max_input_length [1] on cpu // input_lengths [local_batch_size] // ite [1] on cpu // random_seed [1] on cpu, optional // output_tensors: // output_ids [max_seq_len, batch_size] // finished [local_batch_size] // sequence_length [local_batch_size] // cum_log_probs [local_batch_size], must be float* FT_CHECK(false); // TODO deprecated, need to remove std::unordered_map input_tensors_map{{"logits", input_tensors->at(0)}, {"embedding_bias", input_tensors->at(1)}, {"step", input_tensors->at(2)}, {"max_input_length", input_tensors->at(3)}, {"input_lengths", input_tensors->at(4)}, {"ite", input_tensors->at(5)}}; if (input_tensors->size() == 7) { input_tensors_map.insert({"random_seed", input_tensors->at(6)}); } std::unordered_map output_tensors_map{{"output_ids", output_tensors->at(0)}, {"finished", output_tensors->at(1)}, {"sequence_length", output_tensors->at(2)}, {"cum_log_probs", output_tensors->at(3)}}; forward(&output_tensors_map, &input_tensors_map); } template void BaseSamplingLayer::forward(std::unordered_map* output_tensors, const std::unordered_map* input_tensors) { FT_LOG_DEBUG("%s", __PRETTY_FUNCTION__); TensorMap input_map(*input_tensors); TensorMap output_map(*output_tensors); forward(&output_map, &input_map); } template void BaseSamplingLayer::forward(TensorMap* output_tensors, TensorMap* input_tensors) { // input_tensors: // logits [local_batch_size, vocab_size_padded] // embedding_bias [vocab_size_padded], optional // step [1] on cpu // max_input_length [1] on cpu // input_lengths [local_batch_size], optional // ite [1] on cpu // end_id [local_batch_size], optional // output_tensors: // output_ids [max_seq_len, batch_size] // finished [local_batch_size], optional // sequence_length [local_batch_size], optional // cum_log_probs [batch_size], must be float*, optional // The cumultative log probability of generated tokens. // output_log_probs [local_batch_size], must be float*, optional // The log probs at the current step. 
FT_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); FT_CHECK(input_tensors->size() >= 4); FT_CHECK(output_tensors->size() >= 1); const int batch_size = output_tensors->at("output_ids").shape[1]; const int local_batch_size = input_tensors->at("logits").shape[0]; const int step = input_tensors->at("step").getVal(); const int ite = input_tensors->at("ite").getVal(); const int max_input_length = input_tensors->at("max_input_length").getVal(); T* logits = input_tensors->at("logits").getPtr(); #define ALL_OF(p_, sz_, dt_, v_) (std::all_of(p_, p_ + sz_, [&](dt_ b) { return b == v_; })) bool* skip_decode = skip_decode_ + ite * local_batch_size; if (ALL_OF(skip_decode, local_batch_size, bool, true)) { // No sample in the current batch to do TopX sampling. return; } skip_any_ = std::any_of(skip_decode, skip_decode + local_batch_size, [](bool b) { return b; }); if (skip_any_) { // A TopX Sampling layer directly changes the logit values. In case of skip_any==true, // meaning topk and topp layers will run simultaneously for a batch in the same step. // We copy the logits to an internal buffer, not affecting the other sampling layers. FT_CHECK(input_tensors->at("logits").size() == local_batch_size * vocab_size_padded_); cudaD2Dcpy(runtime_logits_buf_, logits, input_tensors->at("logits").size()); logits = runtime_logits_buf_; } const T* embedding_bias = input_tensors->isExist("embedding_bias") ? input_tensors->at("embedding_bias").getPtr() : nullptr; if (embedding_bias != nullptr || !ALL_OF(temperature_ + ite * local_batch_size, local_batch_size, float, 1.0f)) { invokeBatchApplyTemperaturePenalty(logits, embedding_bias, temperature_buf_ + ite * local_batch_size, local_batch_size, vocab_size_, vocab_size_padded_, stream_); } sync_check_cuda_error(); if (step > 1 && repetition_penalty_type_ != RepetitionPenaltyType::None) { float default_value = getDefaultPenaltyValue(repetition_penalty_type_); if (!ALL_OF(repetition_penalty_ + ite * local_batch_size, local_batch_size, float, default_value)) { invokeBatchApplyRepetitionPenalty( logits, repetition_penalty_buf_ + ite * local_batch_size, output_tensors->at("output_ids").getPtrWithOffset(ite * local_batch_size), batch_size, local_batch_size, vocab_size_padded_, input_tensors->at("input_lengths", Tensor{MEMORY_GPU, TYPE_INT32, {}, nullptr}).getPtr(), max_input_length, step, repetition_penalty_type_, stream_); sync_check_cuda_error(); } } const int num_generated_tokens = step - max_input_length; const int* min_lengths = min_lengths_ + ite * local_batch_size; const bool invoke_min_length_penalty = std::any_of( min_lengths, min_lengths + local_batch_size, [&](int min_length) { return min_length > num_generated_tokens; }); if (invoke_min_length_penalty) { FT_CHECK_WITH_INFO(input_tensors->isExist("end_id"), "Need end_id to apply min length penlaty"); invokeMinLengthPenalty(logits, min_lengths_buf_ + ite * local_batch_size, input_tensors->getPtr("end_id"), output_tensors->getPtr("sequence_length"), max_input_length, local_batch_size, vocab_size_padded_, stream_); sync_check_cuda_error(); } #undef ALL_OF runSampling(output_tensors, input_tensors); if (is_free_buffer_after_forward_) { freeBuffer(); } sync_check_cuda_error(); FT_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } template class BaseSamplingLayer; template class BaseSamplingLayer; } // namespace fastertransformer