Unverified commit 9efcac38, authored by Li Zhang and committed by GitHub

check-in fastertransformer (#7)

* add ft code

* gitignore

* fix lint

* revert fmha
parent 720fc533
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
add_subdirectory(beam_search_layers)
add_subdirectory(sampling_layers)
add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc)
set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(DynamicDecodeLayer PUBLIC -lcudart
TopKSamplingLayer TopPSamplingLayer
OnlineBeamSearchLayer BeamSearchLayer ban_bad_words stop_criteria
gpt_kernels tensor nvtx_utils)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "stdlib.h"
namespace fastertransformer {
// Note that the int8 mode of BERT and the int8 mode of GPT are different.
// For int8 mode = 2 on GPT:
// scale (gemm input scale): quantizes the GEMM input (float/half) into the int8 range, i.e. int8_x = scale * x
// scale_inter: (gemm output scale) / (gemm input scale * gemm weight scale)
// scale_out: 1 / (gemm output scale), dequantizes the activation from the int8 range back to float/half.
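//
// A worked sketch of how the three scales compose for int8 mode = 2 (the gemm
// weight scale itself is folded into scale_inter and is not stored in this struct):
//   int8_x  = scale * x                // quantize the activation into the int8 range
//   acc_i32 = int8_x * int8_w          // int32 GEMM accumulator
//   int8_y  = scale_inter * acc_i32    // rescale into the output int8 range
//   y       = scale_out * int8_y       // dequantize back to float/half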
template<typename T1, typename T2 = T1>
struct DenseWeight {
const T1* kernel = nullptr;
const T2* bias = nullptr;
const T1* fp8_bias = nullptr;
const T1* sp_kernel = nullptr;
// for int8 kernel
const int8_t* int8_kernel = nullptr;
const float* scale = nullptr;
const T2* weight_only_quant_scale = nullptr;
const T2* moe_scale = nullptr;
const float* scale_inter = nullptr;
const float* scale_out = nullptr;
// FP8 scales
// scale = AMAX(tensor) / FP8_MAX
// During GEMM, A (original) = A_scaled (fp8) * "scale of A"
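// A quantize/dequantize sketch using the scalar fields below (assuming the *_inv
// fields hold the reciprocal of the corresponding scale):
//   A_fp8 = (fp8) (A * input_scale_inv)   // quantize
//   A     = A_fp8 * input_scale           // dequantize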
const float* input_scale = nullptr; // a scalar
const float* input_scale_inv = nullptr; // a scalar
const float* weight_scale = nullptr; // a scalar or a vector
const float* weight_scale_inv = nullptr; // a scalar or a vector
const float* output_scale = nullptr; // a scalar
const float* output_scale_inv = nullptr; // a scalar
// host pointer of scales, all are scalars
const float* input_h_scale = nullptr;
const float* input_h_scale_inv = nullptr;
const float* weight_h_scale = nullptr;
const float* weight_h_scale_inv = nullptr;
const float* output_h_scale = nullptr;
const float* output_h_scale_inv = nullptr;
// TODO(bhsueh): check whether we still need this param
const float* per_channel_scale_min = nullptr;  // = min(weight_scale), used to adjust per-channel scaling
bool fuse_gemm_bias = false;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <string>
#include <unordered_map>
#include "src/fastertransformer/layers/BaseLayer.h"
namespace fastertransformer {
class DynamicDecodeBaseLayer: public BaseLayer {
protected:
virtual void allocateBuffer() = 0;
virtual void freeBuffer() = 0;
public:
DynamicDecodeBaseLayer(cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop){};
~DynamicDecodeBaseLayer() = default;
DynamicDecodeBaseLayer(DynamicDecodeBaseLayer const& dynamic_decode_layer): BaseLayer(dynamic_decode_layer){};
virtual void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) = 0;
virtual void forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors) = 0;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors) = 0;
virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors) = 0;
};
} // namespace fastertransformer
/*
* Copyright (c) 2022-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
#include "src/fastertransformer/kernels/ban_bad_words.h"
#include "src/fastertransformer/kernels/stop_criteria_kernels.h"
#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include "src/fastertransformer/layers/beam_search_layers/BeamSearchLayer.h"
#include "src/fastertransformer/layers/beam_search_layers/OnlineBeamSearchLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.h"
namespace fastertransformer {
template<typename T>
void DynamicDecodeLayer<T>::allocateBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
h_pinned_finished_sum_ = (int*)allocator_->reMalloc(h_pinned_finished_sum_, sizeof(int), true, true);
return;
}
template<typename T>
void DynamicDecodeLayer<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
allocator_->free((void**)(&h_pinned_finished_sum_), true);
return;
}
template<typename T>
void DynamicDecodeLayer<T>::initialize()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
online_beamsearch_decode_ = new OnlineBeamSearchLayer<T>(0, // max_batch_size, deprecated
0, // local_head_num, deprecated
0, // size_per_head, deprecated
0, // beam_width, deprecated
vocab_size_,
vocab_size_padded_,
0, // end_id, deprecated
0.0f, // beam_search_diversity_rate_, deprecated
1.0f, // temperature_, deprecated
0.0f, // len_penalty_, deprecated
1.0f, // repetition_penalty_, deprecated
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
beamsearch_decode_ = new BeamSearchLayer<T>(0, // max_batch_size, deprecated
0, // local_head_num, deprecated
0, // size_per_head, deprecated
0, // beam_width, deprecated
vocab_size_,
vocab_size_padded_,
0, // end_id, deprecated
0.0f, // beam_search_diversity_rate_, deprecated
1.0f, // temperature_, deprecated
0.0f, // len_penalty_, deprecated
1.0f, // repetition_penalty_, deprecated
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
topk_decode_ = new TopKSamplingLayer<T>(0,
vocab_size_,
vocab_size_padded_,
0, // end_id, deprecated
0, // top_k_, deprecated
0, // random_seed_, deprecated
1.0f, // temperature_, deprecated
0.0f, // len_penalty_, deprecated
1.0f, // repetition_penalty_, deprecated
stream_,
cublas_wrapper_,
allocator_,
false);
topp_decode_ = new TopPSamplingLayer<T>(0,
vocab_size_,
vocab_size_padded_,
0, // end_id, deprecated
0.0f, // top_p_, deprecated
0, // random_seed_, deprecated
1.0f, // temperature_, deprecated
0.0f, // len_penalty_, deprecated
1.0f, // repetition_penalty_, deprecated
stream_,
cublas_wrapper_,
allocator_,
false,
cuda_device_prop_);
allocateBuffer();
}
template<typename T>
DynamicDecodeLayer<T>::DynamicDecodeLayer(size_t vocab_size,
size_t vocab_size_padded,
int end_id,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
vocab_size_(vocab_size),
vocab_size_padded_(vocab_size_padded),
cuda_device_prop_(cuda_device_prop)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize();
}
template<typename T>
DynamicDecodeLayer<T>::~DynamicDecodeLayer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
delete online_beamsearch_decode_;
delete beamsearch_decode_;
delete topk_decode_;
delete topp_decode_;
freeBuffer();
}
template<typename T>
DynamicDecodeLayer<T>::DynamicDecodeLayer(DynamicDecodeLayer const& dynamic_decode_layer):
BaseLayer(dynamic_decode_layer),
vocab_size_(dynamic_decode_layer.vocab_size_),
vocab_size_padded_(dynamic_decode_layer.vocab_size_padded_),
cuda_device_prop_(dynamic_decode_layer.cuda_device_prop_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize();
}
template<typename T>
void DynamicDecodeLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
{
/**
* @brief Set up the dynamic decode layer for given input runtime arguments.
*
* runtime_args:
* \param runtime_top_k [1] or [batch_size] on cpu, optional.
* \param runtime_top_p [1] or [batch_size] on cpu, optional
* \param beam_search_diversity_rate [1] or [batch_size] on cpu, optional
* \param temperature [1] or [batch_size] on cpu, optional
* \param len_penalty [1] or [batch_size] on cpu, optional
* \param repetition_penalty [1] or [batch_size] on cpu, optional
* \param presence_penalty [1] or [batch_size] on cpu, optional, float
* \param min_length [1] or [batch_size], optional
* \param top_p_decay [batch_size] on gpu, float, optional
* \param top_p_min [batch_size] on gpu, float, optional
* \param top_p_reset_ids [batch_size] on gpu, uint32, optional
*/
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
has_diff_runtime_args_ = hasDiffRuntimeArgs(runtime_args);
if (beam_width == 1) { // sampling layers
topk_decode_->setup(batch_size, beam_width, runtime_args);
topp_decode_->setup(batch_size, beam_width, runtime_args);
}
}
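// A minimal usage sketch (not part of the original code): build the runtime
// arguments on the CPU and call setup() once before the generation loop;
// batch_size and dynamic_decode_layer are assumed to be defined by the caller.
//
//   uint  top_k = 4;
//   float top_p = 0.9f;
//   TensorMap runtime_args({{"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
//                           {"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &top_p}}});
//   dynamic_decode_layer->setup(batch_size, /*beam_width=*/1, &runtime_args);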
template<typename T>
void DynamicDecodeLayer<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TensorMap input_map(*input_tensors);
TensorMap output_map(*output_tensors);
forward(&output_map, &input_map);
}
template<typename T>
void DynamicDecodeLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors)
{
/**
* @brief
* input_tensors:
* \param logits [batch_size, beam_width, vocab_size_padded]
* \param embedding_bias [vocab_size_padded], optional
* \param step [1] on cpu
* \param max_input_length [1] on cpu
* \param input_lengths [batch_size, beam_width], optional
* \param min_length [batch_size], optional
* \param sequence_limit_length [batch_size]
* \param ite [1] on cpu
* \param local_batch_size [1] on cpu
* \param stop_words_list [batch_size, 2, stop_words_length], optional
* \param runtime_top_k [1] or [batch_size] on cpu, optional, uint
* \param runtime_top_p [1] or [batch_size] on cpu, optional, float
* \param temperature [1] or [batch_size] on cpu, optional, float
* \param len_penalty [1] or [batch_size] on cpu, optional, float
* \param repetition_penalty [1] or [batch_size] on cpu, optional, float
* \param presence_penalty [1] or [batch_size] on cpu, optional, float
* Only one of repetition and presence penalties is allowed.
* \param random_seed [1] or [batch_size] on cpu, optional, unsigned long long int
* \param bad_words_list [2, bad_words_length] or [batch_size, 2, bad_words_length], optional
* \param src_cache_indirection
* [local_batch_size, beam_width, max_seq_len]
* the k/v cache index for beam search
* \param is_initialize_random_table [1] on cpu, bool
* \param top_p_decay [batch_size] on gpu, float, optional
* \param top_p_min [batch_size] on gpu, float, optional
* \param top_p_reset_ids [batch_size] on gpu, uint32, optional
*
* output_tensors:
* \param output_ids [max_seq_len, batch_size]
* \param finished [batch_size * beam_width], optional
* \param should_stop [1] on cpu
* \param cum_log_probs [batch_size * beam_width], necessary in beam search
* \param parent_ids [max_seq_len, batch_size * beam_width]
* \param sequence_length [batch_size * beam_width], optional
* \param output_log_probs [request_output_length, batch_size * beam_width], must be float*, optional
* \param tgt_cache_indirection
* [local_batch_size, beam_width, max_seq_len]
* the k/v cache index for beam search
* \param beam_hyps [1] on cpu, a special structure that holds the pointers used by beam search
*
*/
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
const int ite = (int)input_tensors->at("ite").getVal<uint>();
const int step = input_tensors->at("step").getVal<int>();
FT_CHECK(input_tensors->at("logits").shape.size() == 3);
const size_t batch_size = input_tensors->at("logits").shape[0];
const size_t beam_width = input_tensors->at("logits").shape[1];
const size_t local_batch_size = (size_t)input_tensors->at("local_batch_size").getVal<int>();
if (input_tensors->isExist("bad_words_list")) {
const auto& bad_words = input_tensors->at("bad_words_list");
const int* bad_words_ptr = bad_words.getPtr<const int>();
FT_CHECK_WITH_INFO(bad_words.shape.size() == 2 || bad_words.shape.size() == 3,
"Bad words dimension must be 2 or 3.");
const bool is_matrix = bad_words.shape.size() == 2;
if (bad_words.shape.size() == 3) {
FT_CHECK_WITH_INFO(bad_words.shape[0] == batch_size,
fmtstr("Shape of dim 0 of bad words is invalid. It must be equal to batch size."
" However, it is %d and the batch size is %d.",
bad_words.shape[0],
batch_size));
}
const bool shared_bad_words = is_matrix || bad_words.shape[0] == 1;
const size_t bad_words_len = bad_words.shape[is_matrix ? 1 : 2];
// Add check on batch size of bad words
const int id_offset = ite * local_batch_size;
const int decode_vocab_size_units_offset = id_offset * vocab_size_padded_;
invokeBanBadWords((T*)input_tensors->at("logits").getPtrWithOffset(decode_vocab_size_units_offset),
output_tensors->at("output_ids").getPtr<const int>(),
beam_width > 1 ? output_tensors->at("parent_ids").getPtr<const int>() : nullptr,
batch_size,
local_batch_size,
beam_width,
shared_bad_words ?
bad_words_ptr :
bad_words.getPtrWithOffset<const int>(ite * local_batch_size * 2 * bad_words_len),
shared_bad_words,
bad_words_len,
id_offset,
vocab_size_padded_,
step,
stream_);
}
// dynamic decode GPT
if (beam_width > 1) {
// Batch beam search is not supported yet, so when the runtime arguments differ
// across the batch we have to process the sentences one by one.
const size_t dynamic_decode_batch_size = has_diff_runtime_args_ ? 1 : local_batch_size;
const int dynamic_decode_total_iteration = local_batch_size / dynamic_decode_batch_size;
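// Example: with local_batch_size = 4, differing per-sentence arguments give
// dynamic_decode_batch_size = 1 and four iterations over sub-batches of 1;
// identical arguments give a single iteration over the whole local batch.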
for (uint dynamic_ite = ite * dynamic_decode_total_iteration;
dynamic_ite < (ite + 1) * dynamic_decode_total_iteration;
++dynamic_ite) {
const int dynamic_id_offset = dynamic_ite * dynamic_decode_batch_size * beam_width;
const int dynamic_decode_vocab_size_units_offset = dynamic_id_offset * vocab_size_padded_;
// common inputs
Tensor logits = input_tensors->at("logits");
Tensor end_id = input_tensors->at("end_id");
TensorMap dynamic_decode_input_tensors(
{{"logits",
Tensor{logits.where,
logits.type,
{dynamic_decode_batch_size, logits.shape[1], logits.shape[2]},
logits.getPtrWithOffset(dynamic_decode_vocab_size_units_offset)}},
{"step", input_tensors->at("step")},
{"max_input_length", input_tensors->at("max_input_length")},
{"end_id",
Tensor{end_id.where,
end_id.type,
{dynamic_decode_batch_size},
end_id.getPtrWithOffset(dynamic_ite * dynamic_decode_batch_size)}},
{"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &dynamic_ite}}});
if (input_tensors->isExist("embedding_bias")) {
dynamic_decode_input_tensors.insert({"embedding_bias", input_tensors->at("embedding_bias")});
}
if (input_tensors->isExist("input_lengths")) {
Tensor input_lengths = input_tensors->at("input_lengths");
dynamic_decode_input_tensors.insert(
{"input_lengths",
input_lengths.slice({dynamic_decode_batch_size, input_lengths.shape[1]}, dynamic_id_offset)});
}
for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) {
if (t->first.find("random_seed") == std::string::npos) {
dynamic_decode_input_tensors.insert(*t);
}
}
// common outputs
TensorMap dynamic_decode_output_tensors({{"output_ids", output_tensors->at("output_ids")}});
if (output_tensors->isExist("sequence_length")) {
Tensor sequence_length = output_tensors->at("sequence_length");
dynamic_decode_output_tensors.insert({"sequence_length",
Tensor{sequence_length.where,
sequence_length.type,
{dynamic_decode_batch_size * beam_width},
sequence_length.getPtrWithOffset(dynamic_id_offset)}});
}
if (output_tensors->isExist("finished")) {
Tensor finished = output_tensors->at("finished");
dynamic_decode_output_tensors.insert({"finished",
Tensor{finished.where,
finished.type,
{dynamic_decode_batch_size * beam_width},
finished.getPtrWithOffset(dynamic_id_offset)}});
}
if (output_tensors->isExist("cum_log_probs")) {
Tensor cum_log_probs = output_tensors->at("cum_log_probs");
dynamic_decode_output_tensors.insert({"cum_log_probs",
Tensor{cum_log_probs.where,
cum_log_probs.type,
{dynamic_decode_batch_size * beam_width},
cum_log_probs.getPtrWithOffset(dynamic_id_offset)}});
}
if (output_tensors->isExist("beam_hyps")) {
dynamic_decode_output_tensors.insert("beam_hyps", output_tensors->at("beam_hyps"));
}
if (output_tensors->isExist("output_log_probs")) {
dynamic_decode_output_tensors.insert({"output_log_probs", output_tensors->at("output_log_probs")});
}
dynamic_decode_input_tensors.insert({"src_cache_indirection", input_tensors->at("src_cache_indirection")});
dynamic_decode_output_tensors.insert({"parent_ids", output_tensors->at("parent_ids")});
dynamic_decode_output_tensors.insert(
{"tgt_cache_indirection", output_tensors->at("tgt_cache_indirection")});
FT_CHECK_WITH_INFO(dynamic_decode_output_tensors.isExist("cum_log_probs"),
"cum_log_probs should be provided in beam search.");
if (true || beam_width < 16
|| (output_tensors->isExist("beam_hyps")
&& input_tensors->getVal<float>("beam_search_diversity_rate", 0.0f) != 0.0f)) {
// only online_beamsearch_decode_ supports beam_search_diversity_rate when beam_hyps is used
online_beamsearch_decode_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
}
else {
FT_CHECK(false); // deprecate this module
beamsearch_decode_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
}
} // end of dynamic_ite
}
else { // beam_width=1
// Sampling already supports batching, so all sentences are processed in a single pass.
const size_t local_batch_offset = ite * local_batch_size * beam_width;
Tensor logits = input_tensors->at("logits");
Tensor end_id = input_tensors->at("end_id");
TensorMap decode_input_tensors(
{{"logits",
logits.slice({local_batch_size, beam_width, logits.shape[2]}, local_batch_offset * logits.shape[2])},
{"step", input_tensors->at("step")},
{"max_input_length", input_tensors->at("max_input_length")},
{"end_id", end_id.slice({local_batch_size}, ite * local_batch_size)},
{"ite", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &ite}}});
if (input_tensors->isExist("embedding_bias")) {
decode_input_tensors.insert({"embedding_bias", input_tensors->at("embedding_bias")});
}
if (input_tensors->isExist("input_lengths")) {
Tensor input_lengths = input_tensors->at("input_lengths");
decode_input_tensors.insert(
{"input_lengths", input_lengths.slice({local_batch_size, beam_width}, local_batch_offset)});
}
TensorMap decode_output_tensors({{"output_ids", output_tensors->at("output_ids")}});
if (output_tensors->isExist("sequence_length")) {
Tensor sequence_length = output_tensors->at("sequence_length");
decode_output_tensors.insert(
{"sequence_length", sequence_length.slice({local_batch_size * beam_width}, local_batch_offset)});
}
if (output_tensors->isExist("finished")) {
Tensor finished = output_tensors->at("finished");
decode_output_tensors.insert(
{"finished", finished.slice({local_batch_size * beam_width}, local_batch_offset)});
}
if (output_tensors->isExist("cum_log_probs")) {
Tensor cum_log_probs = output_tensors->at("cum_log_probs");
decode_output_tensors.insert(
{"cum_log_probs", cum_log_probs.slice({local_batch_size * beam_width}, local_batch_offset)});
}
if (output_tensors->isExist("output_log_probs")) {
Tensor output_log_probs = output_tensors->at("output_log_probs");
int max_input_length = input_tensors->at("max_input_length").getVal<int>();
size_t step_offset = (step - max_input_length) * batch_size * beam_width;
decode_output_tensors.insert({"output_log_probs",
output_log_probs.slice({output_log_probs.shape[0] - (step - max_input_length),
local_batch_size * beam_width},
step_offset + local_batch_offset)});
}
// Run the top-k / top-p decode layers.
// Batch sampling is supported: if the runtime arguments are, e.g.,
//   topk = [4, 0, 4], topp = [0.0, 0.5, 0.5]
// then topk_decode handles entries [4, x, 4 (with top_p 0.5)]
//      topp_decode handles entries [x, 0.5, x]
// where "x" marks the entries skipped by that layer.
topk_decode_->forward(&decode_output_tensors, &decode_input_tensors);
topp_decode_->forward(&decode_output_tensors, &decode_input_tensors);
}
if (input_tensors->isExist("stop_words_list")) {
const size_t id_offset = ite * local_batch_size * beam_width;
const size_t stop_words_length = input_tensors->at("stop_words_list").shape[2];
invokeStopWordsCriterion(output_tensors->at("output_ids").getPtr<const int>(),
beam_width > 1 ? output_tensors->at("parent_ids").getPtr<const int>() : nullptr,
input_tensors->at("stop_words_list")
.getPtrWithOffset<const int>(ite * local_batch_size * 2 * stop_words_length),
output_tensors->at("finished").getPtrWithOffset<bool>(id_offset),
id_offset,
stop_words_length,
batch_size,
beam_width,
step,
stream_);
}
if (input_tensors->isExist("sequence_limit_length")) {
invokeLengthCriterion(output_tensors->at("finished").getPtr<bool>(),
output_tensors->at("should_stop").getPtr<bool>(),
h_pinned_finished_sum_,
input_tensors->at("sequence_limit_length").getPtr<const uint32_t>(),
batch_size,
beam_width,
step,
stream_);
}
}
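// A minimal per-step usage sketch (not part of the original code; the tensor names
// follow the doc comment in forward() above, and the buffers/variables referenced
// here are assumed to be owned by the caller):
//
//   TensorMap inputs({{"logits", Tensor{MEMORY_GPU, data_type, {batch_size, beam_width, vocab_size_padded}, logits_buf}},
//                     {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}},
//                     {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}},
//                     {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, {batch_size}, end_ids}},
//                     {"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}},
//                     {"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}}});
//   TensorMap outputs({{"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size}, output_ids_buf}},
//                      {"should_stop", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &should_stop}}});
//   dynamic_decode_layer->forward(&outputs, &inputs);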
template<typename T>
bool DynamicDecodeLayer<T>::hasDiffRuntimeArgs(TensorMap* input_tensors)
{
for (int i = 0; i < (int)runtime_arg_names_.size(); i++) {
if (input_tensors->isExist(runtime_arg_names_[i])) {
auto tensor = input_tensors->at(runtime_arg_names_[i]);
FT_CHECK(tensor.shape.size() == 1);
for (int j = 1; j < (int)tensor.shape[0]; j++) {
const void* data = tensor.data;
switch (tensor.type) {
case TYPE_FP32:
if (((const float*)data)[0] != ((const float*)data)[j]) {
return true;
}
break;
case TYPE_INT32:
if (((const int*)data)[0] != ((const int*)data)[j]) {
return true;
}
break;
case TYPE_UINT32:
if (((const uint*)data)[0] != ((const uint*)data)[j]) {
return true;
}
break;
case TYPE_UINT64:
if (((const unsigned long long int*)data)[0] != ((const unsigned long long int*)data)[j]) {
return true;
}
break;
default:
FT_CHECK_WITH_INFO(false, runtime_arg_names_[i] + ": " + tensor.toString() + " is invalid.");
break;
}
}
}
}
return false;
}
template class DynamicDecodeLayer<float>;
template class DynamicDecodeLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2022-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <string>
#include <unordered_map>
#include "src/fastertransformer/kernels/beam_search_topk_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/layers/DynamicDecodeBaseLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.h"
namespace fastertransformer {
template<typename T>
class DynamicDecodeLayer: public BaseLayer {
protected:
void allocateBuffer() override;
void freeBuffer() override;
void initialize();
bool hasDiffRuntimeArgs(TensorMap* input_tensors);
DynamicDecodeBaseLayer* online_beamsearch_decode_;
DynamicDecodeBaseLayer* beamsearch_decode_;
DynamicDecodeBaseLayer* topk_decode_;
DynamicDecodeBaseLayer* topp_decode_;
size_t vocab_size_;
size_t vocab_size_padded_;
cudaDeviceProp* cuda_device_prop_;
// List of argument names which can have different values in runtime
// and does not support a batched version of kernel in beam search.
const std::vector<std::string> runtime_arg_names_ = {"beam_search_diversity_rate",
"temperature",
"len_penalty",
"repetition_penalty",
"presence_penalty",
"min_length"};
bool has_diff_runtime_args_ = false;
int* h_pinned_finished_sum_ = nullptr;
public:
curandState_t* topk_curandstate_buf()
{
return static_cast<BaseSamplingLayer<T>*>(topk_decode_)->curandstate_buf();
}
curandState_t* topp_curandstate_buf()
{
return static_cast<BaseSamplingLayer<T>*>(topp_decode_)->curandstate_buf();
}
DynamicDecodeLayer(size_t vocab_size,
size_t vocab_size_padded,
int end_id,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop);
~DynamicDecodeLayer();
DynamicDecodeLayer(DynamicDecodeLayer const& dynamic_decode_layer);
void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args);
void forward(TensorMap* output_tensors, TensorMap* input_tensors);
void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors);
};
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/layers/FfnFP8Layer.h"
#include "src/fastertransformer/kernels/activation_fp8_kernels.h"
#include "src/fastertransformer/utils/cublasFP8MMWrapper.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
namespace fastertransformer {
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::forward(TensorMap* output_tensors,
TensorMap* input_tensors,
const FfnFP8Weight<T1, T2>* ffn_weights)
{
// input tensors:
// input_hidden_state [token_num, d_model],
// output tensors:
// output_hidden_state [token_num, d_model],
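// High-level flow (a summary of the code below, no additional behavior):
// GEMM1 (d_model -> inter_size) -> bias + activation -> GEMM2 (inter_size -> d_model).
// With FUSE_GEMM_ACT defined, GEMM1 and the bias/activation are issued as a single
// fused call; otherwise they run as separate kernels.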
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() == 1);
FT_CHECK(output_tensors->size() == 1);
const int m = input_tensors->at("input_hidden_state").shape[0];
const int d_model = input_tensors->at("input_hidden_state").shape[1];
const T1* input_hidden_state = input_tensors->at("input_hidden_state").getPtr<T1>();
Tensor output_tensor = output_tensors->at("output_hidden_state");
allocateBuffer(m);
#ifdef FUSE_GEMM_ACT
if (fp8_mode_ == 1) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(inter_buf_bf16_,
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.per_channel_scale_min, // identity_scale
stream_);
invokeAddBiasActivation(m,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
ffn_weights->intermediate_weight.scale,
ffn_weights->intermediate_weight.per_channel_scale_min,
ffn_weights->output_weight.input_scale_inv);
}
else if (fp8_mode_ == 2) {
#ifdef USE_QGMMA
if (getActivationType() == ActivationType::Gelu) {
PUSH_RANGE("FFN gemm 1 bias gelu");
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Conv1x1Gemm<false, true>(inter_buf_,
m,
inter_size_,
d_model,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.bias,
*(ffn_weights->intermediate_weight.input_h_scale), // scale_a,
*(ffn_weights->intermediate_weight.weight_h_scale), // scale_b,
*(ffn_weights->output_weight.input_h_scale_inv), // scale_d,
stream_);
POP_RANGE;
}
else if (getActivationType() == ActivationType::Relu) {
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Conv1x1Gemm<true, false>(inter_buf_,
m,
inter_size_,
d_model,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.bias,
*(ffn_weights->intermediate_weight.input_h_scale), // scale_a,
*(ffn_weights->intermediate_weight.weight_h_scale), // scale_b,
*(ffn_weights->output_weight.input_h_scale_inv), // scale_d,
stream_);
}
#else // USE_QGMMA
const float alpha = 1.0f;
const float beta = 0.0f;
if (getActivationType() == ActivationType::Gelu) {
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
->Gemm_Bias_Act<false, true>(inter_buf_bf16_,
#else // FP8_GEMM_OUTPUT_QUANT_DISABLE
->Gemm_Bias_Act<false, true>(inter_buf_,
#endif // FP8_GEMM_OUTPUT_QUANT_DISABLE
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.weight_scale,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
stream_);
}
else if (getActivationType() == ActivationType::Relu) {
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
->Gemm_Bias_Act<true, false>(inter_buf_bf16_,
#else // FP8_GEMM_OUTPUT_QUANT_DISABLE
->Gemm_Bias_Act<true, false>(inter_buf_,
#endif // #ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.weight_scale,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
stream_);
}
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
invokeQuantizeMatrix<T1, T2, QUANTIZE_MODE::PER_TENSOR>(
inter_buf_, ffn_weights->output_weight.input_scale_inv, inter_buf_bf16_, m * inter_size_, 1, stream_);
#endif  // FP8_GEMM_OUTPUT_QUANT_DISABLE
#endif // USE_QGMMA
}
#else // FUSE_GEMM_ACT
PUSH_RANGE("FFN gemm 1");
#ifdef SPARSITY_ENABLED
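// The sparse GEMM path below pads the row dimension up to a multiple of 8
// (presumably a kernel alignment requirement); the rounding is equivalent to
// m_padded = ceil(m / 8) * 8.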
int m_tmp = m;
if (m_tmp % 8 != 0) {
m_tmp = (m_tmp / 8 + 1) * 8;
}
const int m_padded = m_tmp;
if (sparse_ && cublas_wrapper_->isUseSparse(1, inter_size_, m, d_model)) {
FT_CHECK(false);
// cublas_wrapper_->SpGemm(CUBLAS_OP_N,
// CUBLAS_OP_N,
// inter_size_,
// m_padded,
// d_model,
// ffn_weights->intermediate_weight.sp_kernel,
// input_hidden_state,
// inter_buf_);
}
else {
#endif // SPARSITY_ENABLED
if (fp8_mode_ == 1) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(inter_buf_bf16_,
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.per_channel_scale_min, // identity_scale
stream_);
}
else if (fp8_mode_ == 2) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(inter_buf_bf16_,
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.weight_scale,
stream_);
}
#ifdef SPARSITY_ENABLED
}
#endif // SPARSITY_ENABLED
POP_RANGE;
PUSH_RANGE("FFN add bias act");
if (fp8_mode_ == 1) {
invokeAddBiasActivation(m,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
ffn_weights->intermediate_weight.scale,
ffn_weights->intermediate_weight.per_channel_scale_min,
ffn_weights->output_weight.input_scale_inv);
}
else if (fp8_mode_ == 2) {
invokeAddBiasActivation(m,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
nullptr,
nullptr,
ffn_weights->output_weight.input_scale_inv);
}
sync_check_cuda_error();
POP_RANGE;
#endif // FUSE_GEMM_ACT
PUSH_RANGE("FFN gemm 2");
#ifdef SPARSITY_ENABLED
if (sparse_ && cublas_wrapper_->isUseSparse(1, d_model, m, inter_size_)) {
FT_CHECK(false);
// cublas_wrapper_->SpGemm(CUBLAS_OP_N,
// CUBLAS_OP_N,
// d_model,
// m_padded,
// inter_size_,
// ffn_weights->output_weight.sp_kernel,
// inter_buf_,
// output_tensor);
}
else {
#endif  // SPARSITY_ENABLED
if (fp8_mode_ == 1) {
const float alpha = 1.0f;
const float beta = 0.0f;
if (output_tensor.type == TYPE_BF16) {
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(output_tensor.getPtr<T2>(),
(int)1,
(int)m,
(int)d_model,
(int)inter_size_,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
(const __nv_fp8_e4m3*)inter_buf_,
(const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
ffn_weights->output_weight.input_scale,
ffn_weights->identity_scale,
stream_);
}
else if (output_tensor.type == TYPE_FP8_E4M3) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(output_tensor.getPtr<T1>(),
(int)1,
(int)m,
(int)d_model,
(int)inter_size_,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
(const __nv_fp8_e4m3*)inter_buf_,
(const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
ffn_weights->output_weight.input_scale,
ffn_weights->output_weight.per_channel_scale_min,
ffn_weights->output_weight.output_scale_inv,
stream_);
}
else {
FT_CHECK(false);
}
}
else if (fp8_mode_ == 2) {
if (output_tensor.type == TYPE_BF16) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(output_tensor.getPtr<T2>(),
(int)1,
(int)m,
(int)d_model,
(int)inter_size_,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
(const __nv_fp8_e4m3*)inter_buf_,
(const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
ffn_weights->output_weight.input_scale,
ffn_weights->output_weight.weight_scale,
stream_);
}
else if (output_tensor.type == TYPE_FP8_E4M3) {
// It looks like Conv1x1Gemm does not bring better performance for this GEMM
// because its k dimension is large.
// #ifdef USE_QGMMA
// reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
//     ->Conv1x1Gemm<false, false>(output_tensor.getPtr<T1>(),
//                                 m,
//                                 d_model,
//                                 inter_size_,
//                                 inter_buf_,
//                                 ffn_weights->output_weight.kernel,
//                                 ffn_weights->output_weight.bias,
//                                 *(ffn_weights->output_weight.input_h_scale),       // scale_a
//                                 *(ffn_weights->output_weight.weight_h_scale),      // scale_b
//                                 *(ffn_weights->output_weight.output_h_scale_inv),  // scale_d
//                                 stream_);
// #else // USE_QGMMA
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(output_tensor.getPtr<T1>(),
(int)1,
(int)m,
(int)d_model,
(int)inter_size_,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
(const __nv_fp8_e4m3*)inter_buf_,
(const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
ffn_weights->output_weight.input_scale,
ffn_weights->output_weight.weight_scale,
ffn_weights->output_weight.output_scale_inv,
stream_);
// #endif // USE_QGMMA
}
else {
FT_CHECK(false);
}
}
#ifdef SPARSITY_ENABLED
}
#endif // SPARSITY_ENABLED
POP_RANGE;
sync_check_cuda_error();
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
sync_check_cuda_error();
}
template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::FfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse),
inter_size_(inter_size),
fp8_mode_(fp8_mode)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer):
BaseLayer(ffn_layer.stream_,
ffn_layer.cublas_wrapper_,
ffn_layer.allocator_,
ffn_layer.is_free_buffer_after_forward_,
ffn_layer.cuda_device_prop_,
ffn_layer.sparse_),
inter_size_(ffn_layer.inter_size_),
fp8_mode_(ffn_layer.fp8_mode_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::~FfnFP8Layer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
cublas_wrapper_ = nullptr;
freeBuffer();
}
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::allocateBuffer(size_t token_num)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
inter_buf_ = (T1*)allocator_->reMalloc(inter_buf_, sizeof(T1) * token_num * inter_size_, false);
inter_buf_bf16_ = (T2*)allocator_->reMalloc(inter_buf_bf16_, sizeof(T2) * token_num * inter_size_, false);
is_allocate_buffer_ = true;
}
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&inter_buf_));
allocator_->free((void**)(&inter_buf_bf16_));
is_allocate_buffer_ = false;
}
}
template class FfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;
template<typename T1, typename T2>
GeluFfnFP8Layer<T1, T2>::GeluFfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
FfnFP8Layer<T1, T2>(inter_size, fp8_mode, stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse)
{
}
template<typename T1, typename T2>
GeluFfnFP8Layer<T1, T2>::GeluFfnFP8Layer(GeluFfnFP8Layer<T1, T2> const& gelu_ffn_layer):
FfnFP8Layer<T1, T2>(gelu_ffn_layer)
{
}
template<typename T1, typename T2>
void GeluFfnFP8Layer<T1, T2>::invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale)
{
FP8ActivationParam<T1, T2> param{inter_buf_bf16_,
inter_buf_,
bias,
input_scale,
input_scale_2,
input_scale_2_min,
output_scale,
(uint32_t)m,
(uint32_t)inter_size_,
stream_};
invokeFP8AddBiasGelu<T1, T2>(param);
}
template class GeluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;
template<typename T1, typename T2>
ReluFfnFP8Layer<T1, T2>::ReluFfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
FfnFP8Layer<T1, T2>(inter_size, fp8_mode, stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse)
{
}
template<typename T1, typename T2>
ReluFfnFP8Layer<T1, T2>::ReluFfnFP8Layer(ReluFfnFP8Layer<T1, T2> const& relu_ffn_layer):
FfnFP8Layer<T1, T2>(relu_ffn_layer)
{
}
template<typename T1, typename T2>
void ReluFfnFP8Layer<T1, T2>::invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale)
{
FP8ActivationParam<T1, T2> param{inter_buf_bf16_,
inter_buf_,
bias,
input_scale,
input_scale_2,
input_scale_2_min,
output_scale,
(uint32_t)m,
(uint32_t)inter_size_,
stream_};
invokeFP8AddBiasRelu<T1, T2>(param);
}
template class ReluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/layers/FfnFP8Weight.h"
#include "src/fastertransformer/layers/FfnLayer.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include <vector>
namespace fastertransformer {
template<typename T1, typename T2>
class FfnFP8Layer: public BaseLayer {
private:
void allocateBuffer() override;
void freeBuffer() override;
void allocateBuffer(size_t token_num);
protected:
const int fp8_mode_;
T1* inter_buf_ = nullptr;
T2* inter_buf_bf16_ = nullptr;
size_t inter_size_;
virtual void invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale) = 0;
public:
FfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer);
virtual ~FfnFP8Layer();
virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnFP8Weight<T1, T2>* ffn_weights);
virtual ActivationType getActivationType() = 0;
};
template<typename T1, typename T2>
class GeluFfnFP8Layer: public FfnFP8Layer<T1, T2> {
public:
GeluFfnFP8Layer(size_t inter_size,
int fp8_mode_,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
GeluFfnFP8Layer(GeluFfnFP8Layer<T1, T2> const& ffn_layer);
virtual ~GeluFfnFP8Layer() = default;
ActivationType getActivationType() override
{
return ActivationType::Gelu;
};
protected:
using FfnFP8Layer<T1, T2>::stream_;
private:
using FfnFP8Layer<T1, T2>::inter_buf_;
using FfnFP8Layer<T1, T2>::inter_size_;
using FfnFP8Layer<T1, T2>::fp8_mode_;
using FfnFP8Layer<T1, T2>::inter_buf_bf16_;
void invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale) override;
};
template<typename T1, typename T2>
class ReluFfnFP8Layer: public FfnFP8Layer<T1, T2> {
public:
ReluFfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
ReluFfnFP8Layer(ReluFfnFP8Layer<T1, T2> const& ffn_layer);
virtual ~ReluFfnFP8Layer() = default;
ActivationType getActivationType() override
{
return ActivationType::Relu;
};
protected:
using FfnFP8Layer<T1, T2>::stream_;
private:
using FfnFP8Layer<T1, T2>::inter_buf_;
using FfnFP8Layer<T1, T2>::inter_size_;
using FfnFP8Layer<T1, T2>::fp8_mode_;
using FfnFP8Layer<T1, T2>::inter_buf_bf16_;
void invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale) override;
};
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/fastertransformer/utils/ScaleList.h"
namespace fastertransformer {
template<typename T1, typename T2>
struct FfnFP8Weight: FfnWeight<T1, T2> {
ScaleList* scale_list_ptr;
float* identity_scale;
float* identity_h_scale;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/fastertransformer/utils/ScaleList.h"
namespace fastertransformer {
template<typename T>
struct FfnINT8Weight: FfnWeight<T> {
ScaleList* scale_list_ptr;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/layers/FfnLayer.h"
#include "src/fastertransformer/kernels/transpose_int8_kernels.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
namespace fastertransformer {
template<typename T>
void FfnLayer<T>::forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights)
{
TensorMap input_tensor({{"ffn_input", input_tensors->at(0)}});
TensorMap output_tensor({{"ffn_output", output_tensors->at(0)}});
forward(&output_tensor, &input_tensor, ffn_weights);
}
template<typename T>
void FfnLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnWeight<T>* ffn_weights)
{
// input tensors:
// ffn_input [token_num, hidden_dimension],
// ia3_tasks [batch_size] (optional)
// moe_k [1], uint64 (optional)
// padding_offset [token_num] (optional)
// seq_len [1], int32, (optional), only used for ia3
// output tensors:
// ffn_output [token_num, hidden_dimension] or [moe_k * token_num, hidden_dimension] if use_moe
// expert_scales [token_num, moe_k] (optional)
// expanded_source_row_to_expanded_dest_row [token_num, moe_k] (optional)
// expert_for_source_row [token_num, moe_k] (optional)
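// int8_mode_ as used below (inferred from the code paths in this function):
//   0 - plain FP GEMMs.
//   1 - weight-only int8: FP activations with int8 weights dequantized on the fly
//       (uses weight_only_quant_scale and the CUTLASS fpA_intB / MoE runners).
//   2 - full int8 GEMMs: int8 activations and weights with per-tensor scales
//       (uses scale_inter and scale_out).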
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() >= 1 && input_tensors->size() <= 5);
FT_CHECK(output_tensors->size() >= 1 && output_tensors->size() <= 4);
bool use_moe = false;
size_t moe_k = 0;
if (input_tensors->isExist("moe_k")) {
use_moe = true;
moe_k = input_tensors->at("moe_k").getVal<size_t>();
}
allocateBuffer(input_tensors->at("ffn_input").shape[0], moe_k, use_moe);
const int m = input_tensors->at("ffn_input").shape[0];
T* output_tensor = output_tensors->at("ffn_output").getPtr<T>();
const T* input_tensor = input_tensors->at("ffn_input").getPtr<const T>();
// for moe output
T* expert_scales = nullptr;
int* permuted_rows = nullptr;
int* permuted_experts = nullptr;
// The MoE outputs must either all exist or all be absent.
FT_CHECK((use_moe && output_tensors->isExist("expert_scales")
&& output_tensors->isExist("expanded_source_row_to_expanded_dest_row")
&& output_tensors->isExist("expert_for_source_row"))
|| (!use_moe && !output_tensors->isExist("expert_scales")
&& !output_tensors->isExist("expanded_source_row_to_expanded_dest_row")
&& !output_tensors->isExist("expert_for_source_row")));
if (use_moe) {
expert_scales = output_tensors->at("expert_scales").getPtr<T>();
permuted_rows = output_tensors->at("expanded_source_row_to_expanded_dest_row").getPtr<int>();
permuted_experts = output_tensors->at("expert_for_source_row").getPtr<int>();
}
// TODO: INT8 and sparsity are currently not implemented for gated activations (GeGLU / ReGLU)
const bool use_gated_activation = use_gated_activation_ && ffn_weights->intermediate_weight2.kernel != nullptr;
// MoE cannot currently be combined with gated activations
FT_CHECK(!(use_gated_activation && use_moe));
auto activation_type = getActivationType();
const int* ia3_tasks = input_tensors->getPtr<const int>("ia3_tasks", nullptr);
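// MoE path: a gating GEMM first produces [m, expert_num_] routing scores, then
// run_moe_fc dispatches each token to its top moe_k experts, runs the expert FFNs,
// and fills the permutation/score output tensors declared above.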
if (use_moe) {
PUSH_RANGE("FFN moe");
FT_CHECK(ia3_tasks == nullptr);
cublas_wrapper_->Gemm(CUBLAS_OP_N,
CUBLAS_OP_N,
expert_num_,
m,
hidden_units_,
ffn_weights->gating_weight.kernel,
expert_num_,
input_tensor,
hidden_units_,
moe_gates_buf_,
expert_num_);
if (int8_mode_ == 0) {
moe_fc_runner_->run_moe_fc(input_tensor,
moe_gates_buf_,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.weight_only_quant_scale,
ffn_weights->intermediate_weight.bias,
activation_type,
ffn_weights->output_weight.kernel,
ffn_weights->output_weight.weight_only_quant_scale,
m,
hidden_units_,
inter_size_,
expert_num_,
moe_k,
moe_fc_workspace_,
output_tensor,
expert_scales,
permuted_rows,
permuted_experts,
stream_);
}
else if (int8_mode_ == 1) {
FT_CHECK_WITH_INFO(moe_int8_weight_only_fc_runner_.get() != NULL,
"weight only runner was not initialized.");
FT_CHECK(ffn_weights->intermediate_weight.int8_kernel != NULL
&& ffn_weights->intermediate_weight.weight_only_quant_scale != NULL);
FT_CHECK(ffn_weights->output_weight.int8_kernel != NULL
&& ffn_weights->output_weight.weight_only_quant_scale != NULL);
moe_int8_weight_only_fc_runner_->run_moe_fc(
input_tensor,
moe_gates_buf_,
reinterpret_cast<const uint8_t*>(ffn_weights->intermediate_weight.int8_kernel),
ffn_weights->intermediate_weight.weight_only_quant_scale,
ffn_weights->intermediate_weight.bias,
activation_type,
reinterpret_cast<const uint8_t*>(ffn_weights->output_weight.int8_kernel),
ffn_weights->output_weight.weight_only_quant_scale,
m,
hidden_units_,
inter_size_,
expert_num_,
moe_k,
moe_fc_workspace_,
output_tensor,
expert_scales,
permuted_rows,
permuted_experts,
stream_);
}
else {
FT_CHECK_WITH_INFO(false, "Invalid int8 mode for MoE");
}
sync_check_cuda_error();
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
sync_check_cuda_error();
POP_RANGE;
return;
}
PUSH_RANGE("FFN gemm 1");
int m_tmp = input_tensors->at("ffn_input").shape[0];
if (m_tmp % 8 != 0) {
m_tmp = (m_tmp / 8 + 1) * 8;
}
const int m_padded = m_tmp;
#ifdef SPARSITY_ENABLED
bool use_sparse_gemm = sparse_ && cublas_wrapper_->isUseSparse(1, inter_size_, m, hidden_units_);
#else
constexpr bool use_sparse_gemm = false;
#endif
if (use_sparse_gemm) {
FT_CHECK(!use_gated_activation);
#ifdef SPARSITY_ENABLED
cublas_wrapper_->SpGemm(CUBLAS_OP_N,
CUBLAS_OP_N,
inter_size_,
m_padded,
hidden_units_,
ffn_weights->intermediate_weight.sp_kernel,
input_tensor,
inter_buf_);
#endif
}
else {
if (int8_mode_ == 1) {
FT_CHECK_WITH_INFO(weight_only_int8_fc_runner_.get() != NULL, "weight only runner was not initialized.");
FT_CHECK(ffn_weights->intermediate_weight.int8_kernel != NULL
&& ffn_weights->intermediate_weight.weight_only_quant_scale != NULL);
if (ia3_tasks == nullptr && !use_gated_activation) {
// launch fused GEMM + activation
weight_only_int8_fc_runner_->gemm_bias_act(
input_tensor,
reinterpret_cast<const uint8_t*>(ffn_weights->intermediate_weight.int8_kernel),
ffn_weights->intermediate_weight.weight_only_quant_scale,
ffn_weights->intermediate_weight.bias,
inter_buf_,
m,
inter_size_,
hidden_units_,
activation_type,
mixed_gemm_workspace_,
mixed_gemm_ws_bytes_,
stream_);
}
else {
// Otherwise, let FT handle activation
weight_only_int8_fc_runner_->gemm(
input_tensor,
reinterpret_cast<const uint8_t*>(ffn_weights->intermediate_weight.int8_kernel),
ffn_weights->intermediate_weight.weight_only_quant_scale,
inter_buf_,
m,
inter_size_,
hidden_units_,
mixed_gemm_workspace_,
mixed_gemm_ws_bytes_,
stream_);
if (use_gated_activation) {
FT_CHECK(ffn_weights->intermediate_weight2.int8_kernel != NULL
&& ffn_weights->intermediate_weight2.weight_only_quant_scale != NULL);
weight_only_int8_fc_runner_->gemm(
input_tensor,
reinterpret_cast<const uint8_t*>(ffn_weights->intermediate_weight2.int8_kernel),
ffn_weights->intermediate_weight2.weight_only_quant_scale,
inter_buf_2_,
m,
inter_size_,
hidden_units_,
mixed_gemm_workspace_,
mixed_gemm_ws_bytes_,
stream_);
}
}
}
else if (int8_mode_ == 2) {
FT_CHECK(!use_gated_activation);
cublas_wrapper_->Int8Gemm(inter_size_,
m,
hidden_units_,
ffn_weights->intermediate_weight.int8_kernel,
hidden_units_,
input_tensors->getPtr<int8_t>("ffn_input"),
hidden_units_,
reinterpret_cast<int8_t*>(inter_buf_),
inter_size_,
ffn_weights->intermediate_weight.scale_inter);
}
else {
cublas_wrapper_->Gemm(CUBLAS_OP_N,
CUBLAS_OP_N,
inter_size_,
m,
hidden_units_,
ffn_weights->intermediate_weight.kernel,
inter_size_,
input_tensor,
hidden_units_,
inter_buf_,
inter_size_);
if (use_gated_activation) {
cublas_wrapper_->Gemm(CUBLAS_OP_N,
CUBLAS_OP_N,
inter_size_,
m,
hidden_units_,
ffn_weights->intermediate_weight2.kernel,
inter_size_,
input_tensor,
hidden_units_,
inter_buf_2_,
inter_size_);
}
}
}
POP_RANGE;
if (int8_mode_ != 1 || ia3_tasks != nullptr || use_gated_activation) {
// if int8_mode == 1 && ia3_tasks == nullptr && we don't use gated activations, we use cutlass
// to fuse GEMM + bias + activation, so we skip the activation function here. In all
// other cases, we must apply the activation function separately.
PUSH_RANGE("add bias act");
genericActivation(m,
ffn_weights->intermediate_weight.bias,
use_gated_activation ? ffn_weights->intermediate_weight2.bias : nullptr,
input_tensors->at("ia3_tasks", {MEMORY_GPU, TYPE_INT32, {}, nullptr}).getPtr<const int>(),
ffn_weights->ia3_weight.kernel,
int8_mode_ == 2 ? ffn_weights->intermediate_weight.scale_out : (float*)nullptr,
int8_mode_ == 2 ? ffn_weights->output_weight.scale : (float*)nullptr,
input_tensors->getPtr<int>("padding_offset", nullptr),
input_tensors->getVal<int>("seq_len", 1));
POP_RANGE;
}
sync_check_cuda_error();
PUSH_RANGE("FFN gemm 2");
#ifdef SPARSITY_ENABLED
use_sparse_gemm = sparse_ && cublas_wrapper_->isUseSparse(1, hidden_units_, m, inter_size_);
#endif
if (use_sparse_gemm) {
#ifdef SPARSITY_ENABLED
cublas_wrapper_->SpGemm(CUBLAS_OP_N,
CUBLAS_OP_N,
hidden_units_,
m_padded,
inter_size_,
ffn_weights->output_weight.sp_kernel,
inter_buf_,
output_tensor);
#endif
}
else {
if (int8_mode_ == 1) {
FT_CHECK_WITH_INFO(weight_only_int8_fc_runner_.get() != NULL, "weight only runner was not initialized.");
FT_CHECK(ffn_weights->output_weight.int8_kernel != NULL
&& ffn_weights->output_weight.weight_only_quant_scale != NULL);
weight_only_int8_fc_runner_->gemm(inter_buf_,
reinterpret_cast<const uint8_t*>(ffn_weights->output_weight.int8_kernel),
ffn_weights->output_weight.weight_only_quant_scale,
output_tensor,
m,
hidden_units_,
inter_size_,
mixed_gemm_workspace_,
mixed_gemm_ws_bytes_,
stream_);
}
else if (int8_mode_ == 2) {
int8_fc_runner_->gemm(reinterpret_cast<int8_t*>(inter_buf_),
ffn_weights->output_weight.int8_kernel,
QuantMode::PerTensorQuant,
ffn_weights->output_weight.scale_inter,
ffn_weights->output_weight.scale_out,
output_tensors->getPtr<T>("ffn_output"),
m,
hidden_units_,
inter_size_,
nullptr,
0,
stream_);
}
else {
cublas_wrapper_->Gemm(CUBLAS_OP_N,
CUBLAS_OP_N,
hidden_units_,
m,
inter_size_,
ffn_weights->output_weight.kernel,
hidden_units_,
inter_buf_,
inter_size_,
output_tensor,
hidden_units_);
}
}
sync_check_cuda_error();
POP_RANGE;
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
sync_check_cuda_error();
}
template<typename T>
FfnLayer<T>::FfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse,
int int8_mode,
bool use_gated_activation):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse),
max_token_num_(max_batch_size * max_seq_len),
head_num_(head_num),
size_per_head_(size_per_head),
expert_num_(expert_num),
hidden_units_(head_num * size_per_head),
max_inter_size_(inter_size),
inter_size_(inter_size),
int8_mode_(int8_mode),
use_gated_activation_(use_gated_activation),
int8_fc_runner_(int8_mode == 2 ? std::make_shared<CutlassInt8GemmRunner<T>>() : nullptr)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (int8_mode_ == 0) {
moe_fc_runner_ = std::make_shared<CutlassMoeFCRunner<T, T>>();
}
else if (int8_mode_ == 1) {
FT_CHECK_WITH_INFO(!(std::is_same<T, float>::value), "Weight only quant not supported for fp32.");
moe_int8_weight_only_fc_runner_ = std::make_shared<CutlassMoeFCRunner<T, uint8_t>>();
weight_only_int8_fc_runner_ = std::make_shared<CutlassFpAIntBGemmRunner<T, uint8_t>>();
}
}
template<typename T>
FfnLayer<T>::FfnLayer(FfnLayer<T> const& ffn_layer):
BaseLayer(ffn_layer.stream_,
ffn_layer.cublas_wrapper_,
ffn_layer.allocator_,
ffn_layer.is_free_buffer_after_forward_,
ffn_layer.cuda_device_prop_,
ffn_layer.sparse_),
max_token_num_(ffn_layer.max_token_num_),
head_num_(ffn_layer.head_num_),
size_per_head_(ffn_layer.size_per_head_),
expert_num_(ffn_layer.expert_num_),
hidden_units_(ffn_layer.hidden_units_),
max_inter_size_(ffn_layer.max_inter_size_),
inter_size_(ffn_layer.inter_size_),
int8_mode_(ffn_layer.int8_mode_),
use_gated_activation_(ffn_layer.use_gated_activation_),
moe_fc_runner_(ffn_layer.moe_fc_runner_),
moe_int8_weight_only_fc_runner_(ffn_layer.moe_int8_weight_only_fc_runner_),
weight_only_int8_fc_runner_(ffn_layer.weight_only_int8_fc_runner_),
int8_fc_runner_(ffn_layer.int8_fc_runner_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template<typename T>
FfnLayer<T>::~FfnLayer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
cublas_wrapper_ = nullptr;
freeBuffer();
}
template<typename T>
void FfnLayer<T>::allocateBuffer()
{
FT_CHECK_WITH_INFO(false,
"FfnLayer::allocateBuffer() is deprecated. Use `allocateBuffer(size_t token_num, ...)` instead");
}
template<typename T>
void FfnLayer<T>::allocateBuffer(size_t token_num, int moe_k, bool use_moe)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (use_moe) {
moe_gates_buf_ =
(T*)allocator_->reMalloc(moe_gates_buf_, sizeof(T) * pad_to_multiple_of_16(token_num * expert_num_), false);
size_t ws_size_moe = 0;
if (int8_mode_ == 0) {
FT_CHECK_WITH_INFO(moe_fc_runner_.get() != NULL, "moe runner was not initialized.");
ws_size_moe = moe_fc_runner_->getWorkspaceSize(token_num, hidden_units_, inter_size_, expert_num_, moe_k);
}
else if (int8_mode_ == 1) {
FT_CHECK_WITH_INFO(moe_int8_weight_only_fc_runner_.get() != NULL,
"weight only moe runner was not initialized.");
ws_size_moe = moe_int8_weight_only_fc_runner_->getWorkspaceSize(
token_num, hidden_units_, inter_size_, expert_num_, moe_k);
}
moe_fc_workspace_ = (char*)allocator_->reMalloc(moe_fc_workspace_, sizeof(char) * ws_size_moe, false);
}
else {
const auto type_size = int8_mode_ == 2 ? sizeof(int8_t) : sizeof(T);
inter_buf_ = (T*)allocator_->reMalloc(inter_buf_, type_size * token_num * max_inter_size_, false);
if (use_gated_activation_) {
inter_buf_2_ = (T*)allocator_->reMalloc(inter_buf_2_, sizeof(T) * token_num * max_inter_size_, false);
}
if (int8_mode_ == 1) {
FT_CHECK_WITH_INFO(weight_only_int8_fc_runner_.get() != NULL, "weight only runner was not initialized.");
// We use max_size for n and k since we reuse buffers for both FCs and want to allocate the max
// possible memory that would be required by any of the individual gemms.
const int max_size = std::max(hidden_units_, inter_size_);
mixed_gemm_ws_bytes_ = weight_only_int8_fc_runner_->getWorkspaceSize(token_num, max_size, max_size);
mixed_gemm_workspace_ = (char*)allocator_->reMalloc(mixed_gemm_workspace_, mixed_gemm_ws_bytes_, false);
}
else if (int8_mode_ == 2) {
const int max_size = std::max(hidden_units_, inter_size_);
int8_gemm_ws_bytes_ = int8_fc_runner_->getWorkspaceSize(token_num, max_size, max_size);
int8_gemm_workspace_ = (char*)allocator_->reMalloc(int8_gemm_workspace_, int8_gemm_ws_bytes_, false);
}
}
is_allocate_buffer_ = true;
}
template<typename T>
void FfnLayer<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&inter_buf_));
if (use_gated_activation_) {
allocator_->free((void**)(&inter_buf_2_));
}
if (expert_num_ != 0) {
allocator_->free((void**)(&moe_gates_buf_));
allocator_->free((void**)(&moe_fc_workspace_));
}
if (mixed_gemm_workspace_) {
allocator_->free((void**)(&mixed_gemm_workspace_));
mixed_gemm_ws_bytes_ = 0;
}
is_allocate_buffer_ = false;
}
}
#define INVOKE_GENERIC_ACT(ACT) \
invokeGenericActivation<ACT>(inter_buf_, \
bias1, \
inter_buf_2_, \
bias2, \
ia3_tasks, \
ia3_weights, \
m, \
inter_size_, \
int8_mode_, \
activation_in, \
activation_out, \
padding_offset, \
seq_len, \
stream_);
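// Informal summary of the gated path (see invokeGenericActivation in
// activation_kernels.h for the exact kernel): with a gated activation the two
// intermediate projections are combined elementwise, roughly
//   inter_buf_ = Act(inter_buf_ + bias1) * (inter_buf_2_ + bias2)
// with the biases broadcast over rows; without gating only the first factor is
// kept. IA3 weights, when provided, additionally rescale the activation per task.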
template<typename T>
void FfnLayer<T>::genericActivation(int m,
const T* bias1,
const T* bias2,
const int* ia3_tasks,
const T* ia3_weights,
const float* activation_in,
const float* activation_out,
const int* padding_offset,
const int seq_len)
{
if (ia3_tasks != nullptr) {
FT_CHECK(seq_len > 0);
}
    // Dispatch according to the activation type reported by the derived layer;
    // the GELU path has a fused bias + GELU fast path when there is no gated buffer
    // and no SmoothQuant dequantization (int8_mode_ <= 1).
switch (getActivationType()) {
case ActivationType::Gelu:
case ActivationType::GeGLU:
if (inter_buf_2_ == nullptr && int8_mode_ <= 1) {
invokeAddBiasGeluV2(
inter_buf_, bias1, ia3_tasks, ia3_weights, padding_offset, seq_len, m, inter_size_, stream_);
}
else {
INVOKE_GENERIC_ACT(GeluActivation);
}
break;
case ActivationType::Relu:
case ActivationType::ReGLU:
INVOKE_GENERIC_ACT(ReluActivation);
break;
case ActivationType::Silu:
case ActivationType::SiGLU:
INVOKE_GENERIC_ACT(SiluActivation);
break;
case ActivationType::Identity:
INVOKE_GENERIC_ACT(IdentityActivation);
break;
}
}
#undef INVOKE_GENERIC_ACT
template class FfnLayer<float>;
template class FfnLayer<half>;
#ifdef ENABLE_BF16
template class FfnLayer<__nv_bfloat16>;
#endif
template<typename T>
GeluFfnLayer<T>::GeluFfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse,
int int8_mode,
bool use_gated_activation):
FfnLayer<T>(max_batch_size,
max_seq_len,
head_num,
size_per_head,
expert_num,
inter_size,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward,
sparse,
int8_mode,
use_gated_activation)
{
}
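// Illustrative construction (the values below are placeholders, not defaults of this
// class):
//   GeluFfnLayer<half> ffn_layer(/*max_batch_size=*/8,
//                                /*max_seq_len=*/1024,
//                                /*head_num=*/32,
//                                /*size_per_head=*/128,
//                                /*expert_num=*/0,
//                                /*inter_size=*/4 * 32 * 128,
//                                stream, cublas_wrapper, allocator,
//                                /*is_free_buffer_after_forward=*/false);
// sparse, int8_mode and use_gated_activation keep their defaults (false, 0, false).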
template<typename T>
GeluFfnLayer<T>::GeluFfnLayer(GeluFfnLayer<T> const& gelu_ffn_layer): FfnLayer<T>(gelu_ffn_layer)
{
}
template class GeluFfnLayer<float>;
template class GeluFfnLayer<half>;
#ifdef ENABLE_BF16
template class GeluFfnLayer<__nv_bfloat16>;
#endif
template<typename T>
ReluFfnLayer<T>::ReluFfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse,
int int8_mode,
bool use_gated_activation):
FfnLayer<T>(max_batch_size,
max_seq_len,
head_num,
size_per_head,
expert_num,
inter_size,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward,
sparse,
int8_mode,
use_gated_activation)
{
}
template<typename T>
ReluFfnLayer<T>::ReluFfnLayer(ReluFfnLayer<T> const& relu_ffn_layer): FfnLayer<T>(relu_ffn_layer)
{
}
template class ReluFfnLayer<float>;
template class ReluFfnLayer<half>;
#ifdef ENABLE_BF16
template class ReluFfnLayer<__nv_bfloat16>;
#endif
template<typename T>
SiluFfnLayer<T>::SiluFfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse,
bool use_gated_activation):
FfnLayer<T>(max_batch_size,
max_seq_len,
head_num,
size_per_head,
expert_num,
inter_size,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward,
sparse,
0,
use_gated_activation)
{
}
template<typename T>
SiluFfnLayer<T>::SiluFfnLayer(SiluFfnLayer<T> const& gelu_ffn_layer): FfnLayer<T>(gelu_ffn_layer)
{
}
template class SiluFfnLayer<float>;
template class SiluFfnLayer<half>;
#ifdef ENABLE_BF16
template class SiluFfnLayer<__nv_bfloat16>;
#endif
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/kernels/activation_kernels.h"
#include "src/fastertransformer/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
#include "src/fastertransformer/kernels/cutlass_kernels/int8_gemm/int8_gemm.h"
#include "src/fastertransformer/kernels/matrix_vector_multiplication.h"
#include "src/fastertransformer/kernels/moe_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/layers/FfnWeight.h"
#include "src/fastertransformer/utils/activation_types.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include <stdint.h>
#include <vector>
namespace fastertransformer {
template<typename T>
class FfnLayer: public BaseLayer {
private:
// buffer handling
size_t max_token_num_ = 0;
// meta data
size_t head_num_; // (martinma): this member is not used in this class. Remove it?
size_t size_per_head_; // (martinma): this member is not used in this class. Remove it?
size_t expert_num_;
// calculated data
size_t hidden_units_;
// gated activation
bool use_gated_activation_;
std::shared_ptr<CutlassMoeFCRunner<T, T>> moe_fc_runner_;
std::shared_ptr<CutlassMoeFCRunner<T, uint8_t>> moe_int8_weight_only_fc_runner_;
std::shared_ptr<CutlassFpAIntBGemmRunner<T, uint8_t>> weight_only_int8_fc_runner_;
std::shared_ptr<CutlassInt8GemmRunner<T>> int8_fc_runner_;
void allocateBuffer() override;
void freeBuffer() override;
void allocateBuffer(size_t token_num, int moe_k = 0, bool use_moe = false);
protected:
T* inter_buf_ = nullptr;
T* inter_buf_2_ = nullptr; // for gated activation
T* moe_gates_buf_ = nullptr;
char* moe_fc_workspace_ = nullptr;
char* mixed_gemm_workspace_ = nullptr;
size_t mixed_gemm_ws_bytes_ = 0;
char* int8_gemm_workspace_ = nullptr;
size_t int8_gemm_ws_bytes_ = 0;
size_t inter_size_;
    /* Used to allocate memory buffers:
       different FFN layers (inter_size) reuse the same FfnLayer instance,
       which is sized for the maximum inter size, so max_inter_size is passed
       as inter_size when the layer is constructed.
    */
size_t max_inter_size_;
    // int8_mode_ == 0: no INT8-related mechanism is used.
    // int8_mode_ == 1: weight-only quantized GEMM (GPT).
    // int8_mode_ == 2: SmoothQuant O3 (per-tensor scales).
int int8_mode_ = 0;
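    // For reference, the mode selects the GEMM path taken in forward():
    //   0 -> cublas_wrapper_->Gemm (and moe_fc_runner_ for MoE),
    //   1 -> weight_only_int8_fc_runner_ / moe_int8_weight_only_fc_runner_,
    //   2 -> cublas_wrapper_->Int8Gemm for FC1 and int8_fc_runner_ for FC2.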
virtual ActivationType getActivationType() const
{
return ActivationType::InvalidType;
};
void genericActivation(int m,
const T* bias1,
const T* bias2,
const int* ia3_tasks,
const T* ia3_weights,
const float* activation_in,
const float* activation_out,
const int* padding_offset,
const int seq_len);
public:
FfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num, // (martinma): redundant parameter?
size_t size_per_head, // (martinma): redundant parameter?
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false,
int int8_mode = 0,
bool use_gated_activation = false);
FfnLayer(FfnLayer<T> const& ffn_layer);
virtual ~FfnLayer();
void resetInterSize(size_t runtime_inter_size)
{
inter_size_ = runtime_inter_size;
}
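    // Illustrative use (caller-side sketch): shrink the active inter size for one
    // forward pass. runtime_inter_size must not exceed the inter_size the layer was
    // constructed with, since buffers are sized from max_inter_size_.
    //   ffn_layer.resetInterSize(runtime_inter_size);
    //   ffn_layer.forward(&output_tensors, &input_tensors, &ffn_weights);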
virtual void forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights);
virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnWeight<T>* ffn_weights);
};
template<typename T>
class GeluFfnLayer: public FfnLayer<T> {
public:
GeluFfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false,
int int8_mode = 0,
bool use_gated_activation = false);
GeluFfnLayer(GeluFfnLayer<T> const& ffn_layer);
virtual ~GeluFfnLayer() = default;
protected:
using FfnLayer<T>::stream_;
virtual ActivationType getActivationType() const override
{
return ActivationType::Gelu;
};
private:
using FfnLayer<T>::inter_buf_;
using FfnLayer<T>::inter_buf_2_;
using FfnLayer<T>::inter_size_;
};
template<typename T>
class ReluFfnLayer: public FfnLayer<T> {
public:
ReluFfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false,
int int8_mode = 0,
bool use_gated_activation = false);
ReluFfnLayer(ReluFfnLayer<T> const& ffn_layer);
virtual ~ReluFfnLayer() = default;
protected:
using FfnLayer<T>::stream_;
virtual ActivationType getActivationType() const override
{
return ActivationType::Relu;
};
private:
using FfnLayer<T>::inter_buf_;
using FfnLayer<T>::inter_buf_2_;
using FfnLayer<T>::inter_size_;
};
template<typename T>
class SiluFfnLayer: public FfnLayer<T> {
public:
SiluFfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false,
bool use_gated_activation = false);
SiluFfnLayer(SiluFfnLayer<T> const& ffn_layer);
virtual ~SiluFfnLayer() = default;
protected:
using FfnLayer<T>::stream_;
virtual ActivationType getActivationType() const override
{
return ActivationType::Silu;
};
private:
using FfnLayer<T>::inter_buf_;
using FfnLayer<T>::inter_buf_2_;
using FfnLayer<T>::inter_size_;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "FfnLayerINT8.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
namespace fastertransformer {
template<typename T>
void FfnLayerINT8<T>::forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights)
{
// input_tensors: [input (token_num, hidden_dimension)]
// output_tensors: [output (token_num, hidden_dimension)]
ScaleList* scale_list = ((const FfnINT8Weight<T>*)ffn_weights)->scale_list_ptr;
cublasINT8MMWrapper* cublas_wrapper = (cublasINT8MMWrapper*)cublas_wrapper_;
FT_CHECK(isValidTokenNum(input_tensors->at(0).shape[0]));
allocateBuffer();
const int m = static_cast<int>(input_tensors->at(0).shape[0]);
#ifdef SPARSITY_ENABLED
int m_tmp = m;
if (m_tmp % 16 != 0) {
m_tmp = (m_tmp / 16 + 1) * 16;
}
const int m_padded = m_tmp;
#endif
int32_t* output_tensor = output_tensors->at(0).getPtr<int32_t>();
const int8_t* input_tensor = input_tensors->at(0).getPtr<const int8_t>();
PUSH_RANGE("FFN gemm 1");
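    // Two quantization schemes are handled below: int8_mode 1 keeps the GEMM output in
    // int32 (inter_int_buf_) and dequantizes inside the bias + activation kernel, while
    // int8_mode 2/3 requantize the GEMM output directly to int8 using the per-layer
    // scale taken from scale_list (h_scale_list_[p3_offset_ + 6]).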
if (int8_mode_ == 1) {
cublas_wrapper->Gemm(inter_int_buf_,
1,
m,
inter_size_,
hidden_units_,
0,
0,
0,
input_tensor,
(int8_t*)(ffn_weights->intermediate_weight.kernel));
}
else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
if (sparse_) {
cublas_wrapper->SpGemm(inter_size_,
m_padded,
hidden_units_,
scale_list->h_scale_list_[scale_list->p3_offset_ + 6],
(int8_t*)(ffn_weights->intermediate_weight.sp_kernel),
input_tensor,
(int8_t*)inter_int_buf_);
}
else {
#endif
cublas_wrapper->Gemm((int8_t*)inter_int_buf_,
1,
m,
inter_size_,
hidden_units_,
0,
0,
0,
scale_list->h_scale_list_[scale_list->p3_offset_ + 6],
input_tensor,
(int8_t*)(ffn_weights->intermediate_weight.kernel));
#ifdef SPARSITY_ENABLED
}
#endif
}
POP_RANGE;
PUSH_RANGE("add bias act");
invokeAddBiasActivation(m, ffn_weights->intermediate_weight.bias, scale_list);
POP_RANGE;
sync_check_cuda_error();
PUSH_RANGE("FFN gemm 2");
if (int8_mode_ == 1) {
cublas_wrapper->Gemm(output_tensor,
1,
m,
hidden_units_,
inter_size_,
0,
0,
0,
inter_buf_,
(int8_t*)(ffn_weights->output_weight.kernel));
}
else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
if (sparse_) {
cublas_wrapper->SpGemm(hidden_units_,
m_padded,
inter_size_,
scale_list->h_scale_list_[scale_list->p3_offset_ + 7],
(int8_t*)(ffn_weights->output_weight.sp_kernel),
inter_buf_,
(int8_t*)output_tensor);
}
else {
#endif
cublas_wrapper->Gemm((int8_t*)output_tensor,
1,
m,
hidden_units_,
inter_size_,
0,
0,
0,
scale_list->h_scale_list_[scale_list->p3_offset_ + 7],
inter_buf_,
(int8_t*)(ffn_weights->output_weight.kernel));
#ifdef SPARSITY_ENABLED
}
#endif
}
POP_RANGE;
sync_check_cuda_error();
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
sync_check_cuda_error();
}
template<typename T>
FfnLayerINT8<T>::FfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
max_token_num_(max_batch_size * max_seq_len),
head_num_(head_num),
size_per_head_(size_per_head),
hidden_units_(head_num * size_per_head),
inter_size_(inter_size),
int8_mode_(int8_mode),
sparse_(sparse)
{
}
template<typename T>
FfnLayerINT8<T>::FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer):
BaseLayer(
ffn_layer.stream_, ffn_layer.cublas_wrapper_, ffn_layer.allocator_, ffn_layer.is_free_buffer_after_forward_),
max_token_num_(ffn_layer.max_token_num_),
head_num_(ffn_layer.head_num_),
size_per_head_(ffn_layer.size_per_head_),
hidden_units_(ffn_layer.hidden_units_),
inter_size_(ffn_layer.inter_size_),
int8_mode_(ffn_layer.int8_mode_),
sparse_(ffn_layer.sparse_)
{
}
template<typename T>
FfnLayerINT8<T>::~FfnLayerINT8()
{
cublas_wrapper_ = nullptr;
freeBuffer();
}
template<typename T>
void FfnLayerINT8<T>::allocateBuffer()
{
if (is_allocate_buffer_ == false) {
inter_int_buf_ =
(int32_t*)allocator_->reMalloc(inter_int_buf_, sizeof(int32_t) * max_token_num_ * inter_size_, false);
inter_buf_ = (int8_t*)allocator_->reMalloc(inter_buf_, sizeof(int8_t) * max_token_num_ * inter_size_, false);
is_allocate_buffer_ = true;
}
}
template<typename T>
void FfnLayerINT8<T>::freeBuffer()
{
if (is_allocate_buffer_ == true) {
allocator_->free((void**)(&inter_int_buf_));
allocator_->free((void**)(&inter_buf_));
is_allocate_buffer_ = false;
}
}
template<typename T>
bool FfnLayerINT8<T>::isValidTokenNum(size_t token_num)
{
if (max_token_num_ == 0) {
max_token_num_ = token_num;
return true;
}
else {
return token_num <= max_token_num_;
}
}
template class FfnLayerINT8<float>;
template class FfnLayerINT8<half>;
template<typename T>
GeluFfnLayerINT8<T>::GeluFfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
FfnLayerINT8<T>(max_batch_size,
max_seq_len,
head_num,
size_per_head,
inter_size,
int8_mode,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward,
sparse)
{
}
template<typename T>
GeluFfnLayerINT8<T>::GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& gelu_ffn_layer): FfnLayerINT8<T>(gelu_ffn_layer)
{
}
template<typename T>
void GeluFfnLayerINT8<T>::invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list)
{
if (int8_mode_ == 1) {
invokeAddBiasGeluCol32<T>(inter_buf_,
inter_int_buf_,
bias,
m,
inter_size_,
stream_,
&(scale_list->d_scale_list_[scale_list->p2_offset_ + 4 * hidden_units_]),
&(scale_list->d_scale_list_[44 + 2]),
&(scale_list->d_scale_list_[52 + 3]));
}
else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
if (sparse_) {
invokeAddBiasGeluRow<T>(inter_buf_,
(const int8_t*)inter_int_buf_,
bias,
m,
inter_size_,
stream_,
&(scale_list->d_scale_list_[48 + 1]),
&(scale_list->d_scale_list_[52 + 3]));
}
else {
#endif
invokeAddBiasGeluCol32<T>(inter_buf_,
(const int8_t*)inter_int_buf_,
bias,
m,
inter_size_,
stream_,
&(scale_list->d_scale_list_[48 + 1]),
&(scale_list->d_scale_list_[52 + 3]));
#ifdef SPARSITY_ENABLED
}
#endif
}
}
template class GeluFfnLayerINT8<float>;
template class GeluFfnLayerINT8<half>;
template<typename T>
ReluFfnLayerINT8<T>::ReluFfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
FfnLayerINT8<T>(max_batch_size,
max_seq_len,
head_num,
size_per_head,
inter_size,
int8_mode,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward)
{
}
template<typename T>
ReluFfnLayerINT8<T>::ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& relu_ffn_layer): FfnLayerINT8<T>(relu_ffn_layer)
{
}
template<typename T>
void ReluFfnLayerINT8<T>::invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list)
{
    // TODO: add-bias + ReLU activation for the INT8 path is not implemented yet.
}
template class ReluFfnLayerINT8<float>;
template class ReluFfnLayerINT8<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnINT8Weight.h"
#include "src/fastertransformer/kernels/activation_int8_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/utils/ScaleList.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasINT8MMWrapper.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include <vector>
namespace fastertransformer {
template<typename T>
class GeluFfnLayerINT8;
template<typename T>
class ReluFfnLayerINT8;
template<typename T>
class FfnLayerINT8: public BaseLayer {
private:
// buffer handling
size_t max_token_num_ = 0;
// meta data
size_t head_num_;
size_t size_per_head_;
// calculated data
size_t hidden_units_;
void allocateBuffer() override;
void freeBuffer() override;
bool isValidTokenNum(size_t token_num);
protected:
size_t inter_size_;
int int8_mode_;
bool sparse_;
    int32_t* inter_int_buf_ = nullptr;
    int8_t*  inter_buf_     = nullptr;
virtual void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) = 0;
public:
FfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer);
~FfnLayerINT8();
void forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights);
friend GeluFfnLayerINT8<T>;
friend ReluFfnLayerINT8<T>;
};
template<typename T>
class GeluFfnLayerINT8: public FfnLayerINT8<T> {
public:
GeluFfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& ffn_layer);
~GeluFfnLayerINT8() = default;
private:
using FfnLayerINT8<T>::inter_int_buf_;
using FfnLayerINT8<T>::inter_buf_;
using FfnLayerINT8<T>::inter_size_;
using FfnLayerINT8<T>::stream_;
using FfnLayerINT8<T>::int8_mode_;
using FfnLayerINT8<T>::sparse_;
using FfnLayerINT8<T>::hidden_units_;
void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};
template<typename T>
class ReluFfnLayerINT8: public FfnLayerINT8<T> {
public:
ReluFfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& ffn_layer);
~ReluFfnLayerINT8() = default;
private:
using FfnLayerINT8<T>::inter_int_buf_;
using FfnLayerINT8<T>::inter_buf_;
using FfnLayerINT8<T>::inter_size_;
using FfnLayerINT8<T>::stream_;
using FfnLayerINT8<T>::int8_mode_;
using FfnLayerINT8<T>::hidden_units_;
void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "DenseWeight.h"
namespace fastertransformer {
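// Weight bundle for one FFN block. For gated activations (GeGLU / ReGLU / SiGLU),
// intermediate_weight and intermediate_weight2 are both applied to the FFN input and
// combined by the activation kernel; gating_weight is the MoE router projection and
// ia3_weight holds the IA3 per-task rescaling factors (see FfnLayer<T>::forward).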
template<typename T1, typename T2 = T1>
struct FfnWeight {
DenseWeight<T1, T2> gating_weight;
DenseWeight<T1, T2> intermediate_weight;
DenseWeight<T1, T2> intermediate_weight2; // for gated activation
DenseWeight<T1, T2> output_weight;
DenseWeight<T1, T2> ia3_weight;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/DenseWeight.h"
namespace fastertransformer {
template<typename T1, typename T2 = T1>
struct AttentionWeight {
DenseWeight<T1, T2> query_weight;
DenseWeight<T1, T2> key_weight;
DenseWeight<T1, T2> value_weight;
DenseWeight<T1, T2> attention_output_weight;
DenseWeight<T1, T2> ia3_key_weight;
DenseWeight<T1, T2> ia3_value_weight;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <assert.h>
#include <cstdlib>  // std::getenv, used in getAttentionType below
#include <vector>
// #include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h"
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
namespace fastertransformer {
enum class AttentionType {
UNFUSED_MHA,
UNFUSED_PADDED_MHA,
FUSED_MHA,
FUSED_PADDED_MHA
};
/* NOTE:
1. only swin-style relative position bias is supported currently
2. gpt-style (causal-mask) models support any-sequence-length fmha, so we don't need to call isValidSeqLen at run-time
3. bert/vit can also support any-seq-length fmha
*/
template<typename T>
AttentionType getAttentionType(size_t size_per_head,
const int sm,
const bool remove_padding,
const int max_seq_len,
const bool is_fuse = true,
const bool with_swin_relative_position_bias = false,
const bool causal_mask = false)
{
if (std::is_same<T, half>::value && is_fuse) {
// Bert/Vit
if (!causal_mask) {
if (!with_swin_relative_position_bias
&& (((sm == kSM_70 || sm == kSM_72) && size_per_head == 64)
|| ((sm == kSM_75 || sm == kSM_80 || sm == kSM_86)
&& (size_per_head == 64 || size_per_head == 32)))) {
return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA;
}
else if (with_swin_relative_position_bias && (sm == kSM_75 || sm == kSM_80 || sm == kSM_86)
&& max_seq_len <= 256 && size_per_head == 32) {
return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA;
}
}
// GPT and its variants
else {
// FMHA_ENABLE only affects gpt-style models (causal-mask)
            char* fused_qkv = std::getenv("FMHA_ENABLE");
if (fused_qkv != nullptr && std::string(fused_qkv) == "ON") {
if ((sm == kSM_70 || sm == kSM_72 || sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_89)
&& (size_per_head == 32 || size_per_head == 40 || size_per_head == 64 || size_per_head == 80
|| size_per_head == 128 || size_per_head == 144 || size_per_head == 160 || size_per_head == 256)) {
return remove_padding ? AttentionType::FUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
}
}
}
}
#ifdef ENABLE_FP8
else if (std::is_same<T, __nv_fp8_e4m3>::value && is_fuse) {
if (!causal_mask) {
if ((sm == kSM_89 || sm == kSM_90) && max_seq_len < 512 && is_fuse && size_per_head == 64) {
return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA;
}
else {
return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
}
}
}
#endif
return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
}
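// Example (illustrative; the SM value and shapes are assumptions, not defaults): for a
// non-causal fp16 model on SM 80 with padding removed,
//   AttentionType t = getAttentionType<half>(/*size_per_head=*/64, /*sm=*/kSM_80,
//                                            /*remove_padding=*/true, /*max_seq_len=*/384);
// takes the first branch above and yields AttentionType::FUSED_MHA. For causal-mask
// (GPT-style) models the fused path additionally requires the FMHA_ENABLE=ON
// environment variable, as checked above.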
template<typename T>
AttentionType getAttentionTypeINT8(
size_t size_per_head, const int sm, const bool remove_padding, const int max_seq_len, const int int8_mode)
{
if ((int8_mode == 1 || int8_mode == 2)
&& (((sm == kSM_80 || sm == kSM_86) && (size_per_head == 64 || size_per_head == 32) && max_seq_len <= 512)
|| (sm == kSM_75
&& ((size_per_head == 64 && max_seq_len <= 384) || (size_per_head == 32 && max_seq_len <= 512))))) {
return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA;
}
else {
return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
}
}
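// Example (illustrative): getAttentionTypeINT8<half>(/*size_per_head=*/64, /*sm=*/kSM_86,
// /*remove_padding=*/true, /*max_seq_len=*/384, /*int8_mode=*/2) satisfies the first
// condition and returns AttentionType::FUSED_MHA.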
inline bool isFusedMHA(AttentionType attention_type)
{
return attention_type == AttentionType::FUSED_MHA || attention_type == AttentionType::FUSED_PADDED_MHA;
}
inline bool isUnPaddedMHA(AttentionType attention_type)
{
return attention_type == AttentionType::FUSED_MHA || attention_type == AttentionType::UNFUSED_MHA;
}
inline bool isPaddedMHA(AttentionType attention_type)
{
return attention_type == AttentionType::FUSED_PADDED_MHA || attention_type == AttentionType::UNFUSED_PADDED_MHA;
}
inline AttentionType getUnfusedAttentionType(AttentionType attention_type)
{
if (attention_type == AttentionType::FUSED_MHA) {
return AttentionType::UNFUSED_MHA;
}
else if (attention_type == AttentionType::FUSED_PADDED_MHA) {
return AttentionType::UNFUSED_PADDED_MHA;
}
return attention_type;
}
template<typename T>
class BaseAttentionLayer: public BaseLayer {
public:
virtual void
forward(TensorMap* output_tensors, TensorMap* input_tensors, const AttentionWeight<T>* attention_weights) = 0;
BaseAttentionLayer(cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse)
{
}
virtual ~BaseAttentionLayer() = default;
virtual bool isValidSeqLen(const size_t seq_len)
{
return true;
}
};
} // namespace fastertransformer
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/ScaleList.h"
namespace fastertransformer {
template<typename T1, typename T2>
struct AttentionFP8Weight: public AttentionWeight<T1, T2> {
    const float* qk_scale         = nullptr;
    const float* qk_scale_inv     = nullptr;
    float*       qk_h_scale       = nullptr;
    float*       qk_h_scale_inv   = nullptr;
    float*       identity_scale   = nullptr;
    float*       identity_h_scale = nullptr;
};
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <assert.h>
#include <vector>
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h"
#include "src/fastertransformer/layers/attention_layers_fp8/AttentionFP8Weight.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasFP8MMWrapper.h"
#include "src/fastertransformer/utils/memory_utils.h"
namespace fastertransformer {
// template<typename T>
// AttentionType getAttentionType(size_t size_per_head, const int sm, const bool remove_padding, const int max_seq_len,
// const bool is_fuse = true)
// {
// if (std::is_same<T, half>::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm ==
// kSM_72)
// && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) {
// return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA;
// }
// else {
// return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
// }
// }
template<typename T1, typename T2>
class BaseAttentionFP8Layer: public BaseLayer {
public:
virtual void forward(TensorMap* output_tensors,
TensorMap* input_tensors,
const AttentionFP8Weight<T1, T2>* attention_weights) = 0;
BaseAttentionFP8Layer(cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse)
{
}
virtual ~BaseAttentionFP8Layer() = default;
};
} // namespace fastertransformer
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)