/*
 * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "FfnINT8Weight.h"
#include "src/fastertransformer/kernels/activation_int8_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/utils/ScaleList.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasINT8MMWrapper.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include <vector>

namespace fastertransformer {

template<typename T>
class GeluFfnLayerINT8;
template<typename T>
class ReluFfnLayerINT8;

// Base class for INT8-quantized feed-forward (FFN) layers. Derived classes
// supply the fused add-bias + activation step via invokeAddBiasActivation().
template<typename T>
class FfnLayerINT8: public BaseLayer {
private:
    // buffer handling
    size_t max_token_num_ = 0;
    // meta data
    size_t head_num_;
    size_t size_per_head_;
    // calculated data
    size_t hidden_units_;

    void allocateBuffer() override;
    void freeBuffer() override;
    bool isValidTokenNum(size_t token_num);

protected:
    size_t inter_size_;
    int    int8_mode_;
    bool   sparse_;

    int*    inter_int_buf_;
    int8_t* inter_buf_;

    virtual void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) = 0;

public:
    FfnLayerINT8(size_t           max_batch_size,
                 size_t           max_seq_len,
                 size_t           head_num,
                 size_t           size_per_head,
                 size_t           inter_size,
                 int              int8_mode,
                 cudaStream_t     stream,
                 cublasMMWrapper* cublas_wrapper,
                 IAllocator*      allocator,
                 bool             is_free_buffer_after_forward,
                 bool             sparse = false);

    FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer);

    ~FfnLayerINT8();

    void forward(std::vector<fastertransformer::Tensor>*       output_tensors,
                 const std::vector<fastertransformer::Tensor>* input_tensors,
                 const FfnWeight<T>*                           ffn_weights);

    friend GeluFfnLayerINT8<T>;
    friend ReluFfnLayerINT8<T>;
};

// INT8 FFN layer with GELU activation.
template<typename T>
class GeluFfnLayerINT8: public FfnLayerINT8<T> {
public:
    GeluFfnLayerINT8(size_t           max_batch_size,
                     size_t           max_seq_len,
                     size_t           head_num,
                     size_t           size_per_head,
                     size_t           inter_size,
                     int              int8_mode,
                     cudaStream_t     stream,
                     cublasMMWrapper* cublas_wrapper,
                     IAllocator*      allocator,
                     bool             is_free_buffer_after_forward,
                     bool             sparse = false);

    GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& ffn_layer);

    ~GeluFfnLayerINT8() = default;

private:
    using FfnLayerINT8<T>::inter_int_buf_;
    using FfnLayerINT8<T>::inter_buf_;
    using FfnLayerINT8<T>::inter_size_;
    using FfnLayerINT8<T>::stream_;
    using FfnLayerINT8<T>::int8_mode_;
    using FfnLayerINT8<T>::sparse_;
    using FfnLayerINT8<T>::hidden_units_;

    void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};

// INT8 FFN layer with ReLU activation.
template<typename T>
class ReluFfnLayerINT8: public FfnLayerINT8<T> {
public:
    ReluFfnLayerINT8(size_t           max_batch_size,
                     size_t           max_seq_len,
                     size_t           head_num,
                     size_t           size_per_head,
                     size_t           inter_size,
                     int              int8_mode,
                     cudaStream_t     stream,
                     cublasMMWrapper* cublas_wrapper,
                     IAllocator*      allocator,
                     bool             is_free_buffer_after_forward);

    ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& ffn_layer);

    ~ReluFfnLayerINT8() = default;

private:
    using FfnLayerINT8<T>::inter_int_buf_;
    using FfnLayerINT8<T>::inter_buf_;
    using FfnLayerINT8<T>::inter_size_;
    using FfnLayerINT8<T>::stream_;
    using FfnLayerINT8<T>::int8_mode_;
    using FfnLayerINT8<T>::hidden_units_;

    void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};

}  // namespace fastertransformer
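
/*
 * Usage sketch (illustrative only, kept inside a comment so this header stays
 * declaration-only): constructing a GELU INT8 FFN layer and running a forward
 * pass. The variable names (stream, cublas_wrapper, allocator, ffn_weights,
 * the Tensor objects) and the tensor layout are assumptions based on the
 * declarations above, not a verbatim example from the repository.
 *
 *   GeluFfnLayerINT8<half> ffn_layer(max_batch_size,
 *                                    max_seq_len,
 *                                    head_num,
 *                                    size_per_head,
 *                                    inter_size,
 *                                    int8_mode,
 *                                    stream,            // cudaStream_t
 *                                    cublas_wrapper,    // cublasINT8MMWrapper*
 *                                    allocator,         // IAllocator*
 *                                    false,             // is_free_buffer_after_forward
 *                                    false);            // sparse
 *
 *   // input/output hidden states, e.g. shape [token_num, head_num * size_per_head]
 *   std::vector<Tensor> input_tensors  = {input_hidden_states};
 *   std::vector<Tensor> output_tensors = {output_hidden_states};
 *   ffn_layer.forward(&output_tensors, &input_tensors, &ffn_weights);  // ffn_weights: FfnWeight<half>
 */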