Unverified commit 9efcac38, authored by Li Zhang and committed by GitHub

check-in fastertransformer (#7)

* add ft code

* gitignore

* fix lint

* revert fmha
parent 720fc533
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
add_subdirectory(beam_search_layers)
add_subdirectory(sampling_layers)
add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc)
set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(DynamicDecodeLayer PUBLIC -lcudart
TopKSamplingLayer TopPSamplingLayer
OnlineBeamSearchLayer BeamSearchLayer ban_bad_words stop_criteria
gpt_kernels tensor nvtx_utils)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "stdlib.h"
namespace fastertransformer {
// Note that the int8 mode of BERT and the int8 mode of GPT are different.
// For int8 mode = 2 on GPT:
// scale (gemm input scale): quantizes the GEMM input (float/half) into the int8 range, i.e. int8_x = scale * x
// scale_inter: (gemm output scale) / (gemm input scale * gemm weight scale)
// scale_out: 1 / (gemm output scale), dequantizes the activation from the int8 range back to float/half.
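//
// A worked sketch of how the three scales compose for int8 mode = 2 (the gemm
// weight scale itself is folded into scale_inter and is not stored in this struct):
//   int8_x  = scale * x                // quantize the activation into the int8 range
//   acc_i32 = int8_x * int8_w          // int32 GEMM accumulator
//   int8_y  = scale_inter * acc_i32    // rescale into the output int8 range
//   y       = scale_out * int8_y       // dequantize back to float/half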
template<typename T1, typename T2 = T1>
struct DenseWeight {
const T1* kernel = nullptr;
const T2* bias = nullptr;
const T1* fp8_bias = nullptr;
const T1* sp_kernel = nullptr;
// for int8 kernel
const int8_t* int8_kernel = nullptr;
const float* scale = nullptr;
const T2* weight_only_quant_scale = nullptr;
const T2* moe_scale = nullptr;
const float* scale_inter = nullptr;
const float* scale_out = nullptr;
// FP8 scales
// scale = AMAX(tensor) / FP8_MAX
// During GEMM, A (original) = A_scaled (fp8) * "scale of A"
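// A quantize/dequantize sketch using the scalar fields below (assuming the *_inv
// fields hold the reciprocal of the corresponding scale):
//   A_fp8 = (fp8) (A * input_scale_inv)   // quantize
//   A     = A_fp8 * input_scale           // dequantize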
const float* input_scale = nullptr; // a scalar
const float* input_scale_inv = nullptr; // a scalar
const float* weight_scale = nullptr; // a scalar or a vector
const float* weight_scale_inv = nullptr; // a scalar or a vector
const float* output_scale = nullptr; // a scalar
const float* output_scale_inv = nullptr; // a scalar
// host pointer of scales, all are scalars
const float* input_h_scale = nullptr;
const float* input_h_scale_inv = nullptr;
const float* weight_h_scale = nullptr;
const float* weight_h_scale_inv = nullptr;
const float* output_h_scale = nullptr;
const float* output_h_scale_inv = nullptr;
// TODO(bhsueh): check whether we still need this param
const float* per_channel_scale_min = nullptr;  // = min(weight_scale), used to adjust per-channel scaling
bool fuse_gemm_bias = false;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <string>
#include <unordered_map>
#include "src/fastertransformer/layers/BaseLayer.h"
namespace fastertransformer {
class DynamicDecodeBaseLayer: public BaseLayer {
protected:
virtual void allocateBuffer() = 0;
virtual void freeBuffer() = 0;
public:
DynamicDecodeBaseLayer(cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, cuda_device_prop){};
~DynamicDecodeBaseLayer() = default;
DynamicDecodeBaseLayer(DynamicDecodeBaseLayer const& dynamic_decode_layer): BaseLayer(dynamic_decode_layer){};
virtual void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) = 0;
virtual void forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors) = 0;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors) = 0;
virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors) = 0;
};
} // namespace fastertransformer
/*
* Copyright (c) 2022-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
#include "src/fastertransformer/kernels/ban_bad_words.h"
#include "src/fastertransformer/kernels/stop_criteria_kernels.h"
#include "src/fastertransformer/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include "src/fastertransformer/layers/beam_search_layers/BeamSearchLayer.h"
#include "src/fastertransformer/layers/beam_search_layers/OnlineBeamSearchLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.h"
namespace fastertransformer {
template<typename T>
void DynamicDecodeLayer<T>::allocateBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
h_pinned_finished_sum_ = (int*)allocator_->reMalloc(h_pinned_finished_sum_, sizeof(int), true, true);
return;
}
template<typename T>
void DynamicDecodeLayer<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
allocator_->free((void**)(&h_pinned_finished_sum_), true);
return;
}
template<typename T>
void DynamicDecodeLayer<T>::initialize()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
online_beamsearch_decode_ = new OnlineBeamSearchLayer<T>(0, // max_batch_size, deprecated
0, // local_head_num, deprecated
0, // size_per_head, deprecated
0, // beam_width, deprecated
vocab_size_,
vocab_size_padded_,
0, // end_id, deprecated
0.0f, // beam_search_diversity_rate_, deprecated
1.0f, // temperature_, deprecated
0.0f, // len_penalty_, deprecated
1.0f, // repetition_penalty_, deprecated
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
beamsearch_decode_ = new BeamSearchLayer<T>(0, // max_batch_size, deprecated
0, // local_head_num, deprecated
0, // size_per_head, deprecated
0, // beam_width, deprecated
vocab_size_,
vocab_size_padded_,
0, // end_id, deprecated
0.0f, // beam_search_diversity_rate_, deprecated
1.0f, // temperature_, deprecated
0.0f, // len_penalty_, deprecated
1.0f, // repetition_penalty_, deprecated
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
topk_decode_ = new TopKSamplingLayer<T>(0,
vocab_size_,
vocab_size_padded_,
0, // end_id, deprecated
0, // top_k_, deprecated
0, // random_seed_, deprecated
1.0f, // temperature_, deprecated
0.0f, // len_penalty_, deprecated
1.0f, // repetition_penalty_, deprecated
stream_,
cublas_wrapper_,
allocator_,
false);
topp_decode_ = new TopPSamplingLayer<T>(0,
vocab_size_,
vocab_size_padded_,
0, // end_id, deprecated
0.0f, // top_p_, deprecated
0, // random_seed_, deprecated
1.0f, // temperature_, deprecated
0.0f, // len_penalty_, deprecated
1.0f, // repetition_penalty_, deprecated
stream_,
cublas_wrapper_,
allocator_,
false,
cuda_device_prop_);
allocateBuffer();
}
template<typename T>
DynamicDecodeLayer<T>::DynamicDecodeLayer(size_t vocab_size,
size_t vocab_size_padded,
int end_id,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
vocab_size_(vocab_size),
vocab_size_padded_(vocab_size_padded),
cuda_device_prop_(cuda_device_prop)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize();
}
template<typename T>
DynamicDecodeLayer<T>::~DynamicDecodeLayer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
delete online_beamsearch_decode_;
delete beamsearch_decode_;
delete topk_decode_;
delete topp_decode_;
freeBuffer();
}
template<typename T>
DynamicDecodeLayer<T>::DynamicDecodeLayer(DynamicDecodeLayer const& dynamic_decode_layer):
BaseLayer(dynamic_decode_layer),
vocab_size_(dynamic_decode_layer.vocab_size_),
vocab_size_padded_(dynamic_decode_layer.vocab_size_padded_),
cuda_device_prop_(dynamic_decode_layer.cuda_device_prop_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize();
}
template<typename T>
void DynamicDecodeLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
{
/**
* @brief Set up the dynamic decode layer for given input runtime arguments.
*
* runtime_args:
* \param runtime_top_k [1] or [batch_size] on cpu, optional.
* \param runtime_top_p [1] or [batch_size] on cpu, optional
* \param beam_search_diversity_rate [1] or [batch_size] on cpu, optional
* \param temperature [1] or [batch_size] on cpu, optional
* \param len_penalty [1] or [batch_size] on cpu, optional
* \param repetition_penalty [1] or [batch_size] on cpu, optional
* \param presence_penalty [1] or [batch_size] on cpu, optional, float
* \param min_length [1] or [batch_size], optional
* \param top_p_decay [batch_size] on gpu, float, optional
* \param top_p_min [batch_size] on gpu, float, optional
* \param top_p_reset_ids [batch_size] on gpu, uint32, optional
*/
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
has_diff_runtime_args_ = hasDiffRuntimeArgs(runtime_args);
if (beam_width == 1) { // sampling layers
topk_decode_->setup(batch_size, beam_width, runtime_args);
topp_decode_->setup(batch_size, beam_width, runtime_args);
}
}
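// A minimal usage sketch (not part of the original code): build the runtime
// arguments on the CPU and call setup() once before the generation loop;
// batch_size and dynamic_decode_layer are assumed to be defined by the caller.
//
//   uint  top_k = 4;
//   float top_p = 0.9f;
//   TensorMap runtime_args({{"runtime_top_k", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &top_k}},
//                           {"runtime_top_p", Tensor{MEMORY_CPU, TYPE_FP32, {1}, &top_p}}});
//   dynamic_decode_layer->setup(batch_size, /*beam_width=*/1, &runtime_args);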
template<typename T>
void DynamicDecodeLayer<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TensorMap input_map(*input_tensors);
TensorMap output_map(*output_tensors);
forward(&output_map, &input_map);
}
template<typename T>
void DynamicDecodeLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors)
{
/**
* @brief
* input_tensors:
* \param logits [batch_size, beam_width, vocab_size_padded]
* \param embedding_bias [vocab_size_padded], optional
* \param step [1] on cpu
* \param max_input_length [1] on cpu
* \param input_lengths [batch_size, beam_width], optional
* \param min_length [batch_size], optional
* \param sequence_limit_length [batch_size]
* \param ite [1] on cpu
* \param local_batch_size [1] on cpu
* \param stop_words_list [batch_size, 2, stop_words_length], optional
* \param runtime_top_k [1] or [batch_size] on cpu, optional, uint
* \param runtime_top_p [1] or [batch_size] on cpu, optional, float
* \param temperature [1] or [batch_size] on cpu, optional, float
* \param len_penalty [1] or [batch_size] on cpu, optional, float
* \param repetition_penalty [1] or [batch_size] on cpu, optional, float
* \param presence_penalty [1] or [batch_size] on cpu, optional, float
* Only one of repetition and presence penalties is allowed.
* \param random_seed [1] or [batch_size] on cpu, optional, unsigned long long int
* \param bad_words_list [2, bad_words_length] or [batch_size, 2, bad_words_length], optional
* \param src_cache_indirection
* [local_batch_size, beam_width, max_seq_len]
* the k/v cache index for beam search
* \param is_initialize_random_table [1] on cpu, bool
* \param top_p_decay [batch_size] on gpu, float, optional
* \param top_p_min [batch_size] on gpu, float, optional
* \param top_p_reset_ids [batch_size] on gpu, uint32, optional
*
* output_tensors:
* \param output_ids [max_seq_len, batch_size]
* \param finished [batch_size * beam_width], optional
* \param should_stop [1] on cpu
* \param cum_log_probs [batch_size * beam_width], necessary in beam search
* \param parent_ids [max_seq_len, batch_size * beam_width]
* \param sequence_length [batch_size * beam_width], optional
* \param output_log_probs [request_output_length, batch_size * beam_width], must be float*, optional
* \param tgt_cache_indirection
* [local_batch_size, beam_width, max_seq_len]
* the k/v cache index for beam search
* \param beam_hyps [1] on cpu, a special structure that holds the pointers used by beam search
*
*/
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
const int ite = (int)input_tensors->at("ite").getVal<uint>();
const int step = input_tensors->at("step").getVal<int>();
FT_CHECK(input_tensors->at("logits").shape.size() == 3);
const size_t batch_size = input_tensors->at("logits").shape[0];
const size_t beam_width = input_tensors->at("logits").shape[1];
const size_t local_batch_size = (size_t)input_tensors->at("local_batch_size").getVal<int>();
if (input_tensors->isExist("bad_words_list")) {
const auto& bad_words = input_tensors->at("bad_words_list");
const int* bad_words_ptr = bad_words.getPtr<const int>();
FT_CHECK_WITH_INFO(bad_words.shape.size() == 2 || bad_words.shape.size() == 3,
"Bad words dimension must be 2 or 3.");
const bool is_matrix = bad_words.shape.size() == 2;
if (bad_words.shape.size() == 3) {
FT_CHECK_WITH_INFO(bad_words.shape[0] == batch_size,
fmtstr("Shape of dim 0 of bad words is invalid. It must be equal to batch size."
" However, it is %d and the batch size is %d.",
bad_words.shape[0],
batch_size));
}
const bool shared_bad_words = is_matrix || bad_words.shape[0] == 1;
const size_t bad_words_len = bad_words.shape[is_matrix ? 1 : 2];
// Add check on batch size of bad words
const int id_offset = ite * local_batch_size;
const int decode_vocab_size_units_offset = id_offset * vocab_size_padded_;
invokeBanBadWords((T*)input_tensors->at("logits").getPtrWithOffset(decode_vocab_size_units_offset),
output_tensors->at("output_ids").getPtr<const int>(),
beam_width > 1 ? output_tensors->at("parent_ids").getPtr<const int>() : nullptr,
batch_size,
local_batch_size,
beam_width,
shared_bad_words ?
bad_words_ptr :
bad_words.getPtrWithOffset<const int>(ite * local_batch_size * 2 * bad_words_len),
shared_bad_words,
bad_words_len,
id_offset,
vocab_size_padded_,
step,
stream_);
}
// dynamic decode GPT
if (beam_width > 1) {
// Batch beam search is not supported yet, so when the runtime arguments differ
// across the batch we have to process the sentences one by one.
const size_t dynamic_decode_batch_size = has_diff_runtime_args_ ? 1 : local_batch_size;
const int dynamic_decode_total_iteration = local_batch_size / dynamic_decode_batch_size;
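// Example: with local_batch_size = 4, differing per-sentence arguments give
// dynamic_decode_batch_size = 1 and four iterations over sub-batches of 1;
// identical arguments give a single iteration over the whole local batch.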
for (uint dynamic_ite = ite * dynamic_decode_total_iteration;
dynamic_ite < (ite + 1) * dynamic_decode_total_iteration;
++dynamic_ite) {
const int dynamic_id_offset = dynamic_ite * dynamic_decode_batch_size * beam_width;
const int dynamic_decode_vocab_size_units_offset = dynamic_id_offset * vocab_size_padded_;
// common inputs
Tensor logits = input_tensors->at("logits");
Tensor end_id = input_tensors->at("end_id");
TensorMap dynamic_decode_input_tensors(
{{"logits",
Tensor{logits.where,
logits.type,
{dynamic_decode_batch_size, logits.shape[1], logits.shape[2]},
logits.getPtrWithOffset(dynamic_decode_vocab_size_units_offset)}},
{"step", input_tensors->at("step")},
{"max_input_length", input_tensors->at("max_input_length")},
{"end_id",
Tensor{end_id.where,
end_id.type,
{dynamic_decode_batch_size},
end_id.getPtrWithOffset(dynamic_ite * dynamic_decode_batch_size)}},
{"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &dynamic_ite}}});
if (input_tensors->isExist("embedding_bias")) {
dynamic_decode_input_tensors.insert({"embedding_bias", input_tensors->at("embedding_bias")});
}
if (input_tensors->isExist("input_lengths")) {
Tensor input_lengths = input_tensors->at("input_lengths");
dynamic_decode_input_tensors.insert(
{"input_lengths",
input_lengths.slice({dynamic_decode_batch_size, input_lengths.shape[1]}, dynamic_id_offset)});
}
for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) {
if (t->first.find("random_seed") == std::string::npos) {
dynamic_decode_input_tensors.insert(*t);
}
}
// common outputs
TensorMap dynamic_decode_output_tensors({{"output_ids", output_tensors->at("output_ids")}});
if (output_tensors->isExist("sequence_length")) {
Tensor sequence_length = output_tensors->at("sequence_length");
dynamic_decode_output_tensors.insert({"sequence_length",
Tensor{sequence_length.where,
sequence_length.type,
{dynamic_decode_batch_size * beam_width},
sequence_length.getPtrWithOffset(dynamic_id_offset)}});
}
if (output_tensors->isExist("finished")) {
Tensor finished = output_tensors->at("finished");
dynamic_decode_output_tensors.insert({"finished",
Tensor{finished.where,
finished.type,
{dynamic_decode_batch_size * beam_width},
finished.getPtrWithOffset(dynamic_id_offset)}});
}
if (output_tensors->isExist("cum_log_probs")) {
Tensor cum_log_probs = output_tensors->at("cum_log_probs");
dynamic_decode_output_tensors.insert({"cum_log_probs",
Tensor{cum_log_probs.where,
cum_log_probs.type,
{dynamic_decode_batch_size * beam_width},
cum_log_probs.getPtrWithOffset(dynamic_id_offset)}});
}
if (output_tensors->isExist("beam_hyps")) {
dynamic_decode_output_tensors.insert("beam_hyps", output_tensors->at("beam_hyps"));
}
if (output_tensors->isExist("output_log_probs")) {
dynamic_decode_output_tensors.insert({"output_log_probs", output_tensors->at("output_log_probs")});
}
dynamic_decode_input_tensors.insert({"src_cache_indirection", input_tensors->at("src_cache_indirection")});
dynamic_decode_output_tensors.insert({"parent_ids", output_tensors->at("parent_ids")});
dynamic_decode_output_tensors.insert(
{"tgt_cache_indirection", output_tensors->at("tgt_cache_indirection")});
FT_CHECK_WITH_INFO(dynamic_decode_output_tensors.isExist("cum_log_probs"),
"cum_log_probs should be provided in beam search.");
if (true || beam_width < 16
|| (output_tensors->isExist("beam_hyps")
&& input_tensors->getVal<float>("beam_search_diversity_rate", 0.0f) != 0.0f)) {
// only online_beamsearch_decode_ supports beam_search_diversity_rate when beam_hyps is used
online_beamsearch_decode_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
}
else {
FT_CHECK(false); // deprecate this module
beamsearch_decode_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
}
} // end of dynamic_ite
}
else { // beam_width=1
// Sampling already supports batching, so all sentences are processed in a single pass.
const size_t local_batch_offset = ite * local_batch_size * beam_width;
Tensor logits = input_tensors->at("logits");
Tensor end_id = input_tensors->at("end_id");
TensorMap decode_input_tensors(
{{"logits",
logits.slice({local_batch_size, beam_width, logits.shape[2]}, local_batch_offset * logits.shape[2])},
{"step", input_tensors->at("step")},
{"max_input_length", input_tensors->at("max_input_length")},
{"end_id", end_id.slice({local_batch_size}, ite * local_batch_size)},
{"ite", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &ite}}});
if (input_tensors->isExist("embedding_bias")) {
decode_input_tensors.insert({"embedding_bias", input_tensors->at("embedding_bias")});
}
if (input_tensors->isExist("input_lengths")) {
Tensor input_lengths = input_tensors->at("input_lengths");
decode_input_tensors.insert(
{"input_lengths", input_lengths.slice({local_batch_size, beam_width}, local_batch_offset)});
}
TensorMap decode_output_tensors({{"output_ids", output_tensors->at("output_ids")}});
if (output_tensors->isExist("sequence_length")) {
Tensor sequence_length = output_tensors->at("sequence_length");
decode_output_tensors.insert(
{"sequence_length", sequence_length.slice({local_batch_size * beam_width}, local_batch_offset)});
}
if (output_tensors->isExist("finished")) {
Tensor finished = output_tensors->at("finished");
decode_output_tensors.insert(
{"finished", finished.slice({local_batch_size * beam_width}, local_batch_offset)});
}
if (output_tensors->isExist("cum_log_probs")) {
Tensor cum_log_probs = output_tensors->at("cum_log_probs");
decode_output_tensors.insert(
{"cum_log_probs", cum_log_probs.slice({local_batch_size * beam_width}, local_batch_offset)});
}
if (output_tensors->isExist("output_log_probs")) {
Tensor output_log_probs = output_tensors->at("output_log_probs");
int max_input_length = input_tensors->at("max_input_length").getVal<int>();
size_t step_offset = (step - max_input_length) * batch_size * beam_width;
decode_output_tensors.insert({"output_log_probs",
output_log_probs.slice({output_log_probs.shape[0] - (step - max_input_length),
local_batch_size * beam_width},
step_offset + local_batch_offset)});
}
// Run the top-k / top-p decode layers.
// Batch sampling is supported: if the runtime arguments are, e.g.,
//   topk = [4, 0, 4], topp = [0.0, 0.5, 0.5]
// then topk_decode handles entries [4, x, 4 (with top_p 0.5)]
//      topp_decode handles entries [x, 0.5, x]
// where "x" marks the entries skipped by that layer.
topk_decode_->forward(&decode_output_tensors, &decode_input_tensors);
topp_decode_->forward(&decode_output_tensors, &decode_input_tensors);
}
if (input_tensors->isExist("stop_words_list")) {
const size_t id_offset = ite * local_batch_size * beam_width;
const size_t stop_words_length = input_tensors->at("stop_words_list").shape[2];
invokeStopWordsCriterion(output_tensors->at("output_ids").getPtr<const int>(),
beam_width > 1 ? output_tensors->at("parent_ids").getPtr<const int>() : nullptr,
input_tensors->at("stop_words_list")
.getPtrWithOffset<const int>(ite * local_batch_size * 2 * stop_words_length),
output_tensors->at("finished").getPtrWithOffset<bool>(id_offset),
id_offset,
stop_words_length,
batch_size,
beam_width,
step,
stream_);
}
if (input_tensors->isExist("sequence_limit_length")) {
invokeLengthCriterion(output_tensors->at("finished").getPtr<bool>(),
output_tensors->at("should_stop").getPtr<bool>(),
h_pinned_finished_sum_,
input_tensors->at("sequence_limit_length").getPtr<const uint32_t>(),
batch_size,
beam_width,
step,
stream_);
}
}
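// A minimal per-step usage sketch (not part of the original code; the tensor names
// follow the doc comment in forward() above, and the buffers/variables referenced
// here are assumed to be owned by the caller):
//
//   TensorMap inputs({{"logits", Tensor{MEMORY_GPU, data_type, {batch_size, beam_width, vocab_size_padded}, logits_buf}},
//                     {"step", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &step}},
//                     {"max_input_length", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &max_input_length}},
//                     {"end_id", Tensor{MEMORY_CPU, TYPE_INT32, {batch_size}, end_ids}},
//                     {"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &ite}},
//                     {"local_batch_size", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}}});
//   TensorMap outputs({{"output_ids", Tensor{MEMORY_GPU, TYPE_INT32, {max_seq_len, batch_size}, output_ids_buf}},
//                      {"should_stop", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &should_stop}}});
//   dynamic_decode_layer->forward(&outputs, &inputs);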
template<typename T>
bool DynamicDecodeLayer<T>::hasDiffRuntimeArgs(TensorMap* input_tensors)
{
for (int i = 0; i < (int)runtime_arg_names_.size(); i++) {
if (input_tensors->isExist(runtime_arg_names_[i])) {
auto tensor = input_tensors->at(runtime_arg_names_[i]);
FT_CHECK(tensor.shape.size() == 1);
for (int j = 1; j < (int)tensor.shape[0]; j++) {
const void* data = tensor.data;
switch (tensor.type) {
case TYPE_FP32:
if (((const float*)data)[0] != ((const float*)data)[j]) {
return true;
}
break;
case TYPE_INT32:
if (((const int*)data)[0] != ((const int*)data)[j]) {
return true;
}
break;
case TYPE_UINT32:
if (((const uint*)data)[0] != ((const uint*)data)[j]) {
return true;
}
break;
case TYPE_UINT64:
if (((const unsigned long long int*)data)[0] != ((const unsigned long long int*)data)[j]) {
return true;
}
break;
default:
FT_CHECK_WITH_INFO(false, runtime_arg_names_[i] + ": " + tensor.toString() + " is invalid.");
break;
}
}
}
}
return false;
}
template class DynamicDecodeLayer<float>;
template class DynamicDecodeLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2022-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <string>
#include <unordered_map>
#include "src/fastertransformer/kernels/beam_search_topk_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/layers/DynamicDecodeBaseLayer.h"
#include "src/fastertransformer/layers/sampling_layers/TopPSamplingLayer.h"
namespace fastertransformer {
template<typename T>
class DynamicDecodeLayer: public BaseLayer {
protected:
void allocateBuffer() override;
void freeBuffer() override;
void initialize();
bool hasDiffRuntimeArgs(TensorMap* input_tensors);
DynamicDecodeBaseLayer* online_beamsearch_decode_;
DynamicDecodeBaseLayer* beamsearch_decode_;
DynamicDecodeBaseLayer* topk_decode_;
DynamicDecodeBaseLayer* topp_decode_;
size_t vocab_size_;
size_t vocab_size_padded_;
cudaDeviceProp* cuda_device_prop_;
// List of argument names which can have different values in runtime
// and does not support a batched version of kernel in beam search.
const std::vector<std::string> runtime_arg_names_ = {"beam_search_diversity_rate",
"temperature",
"len_penalty",
"repetition_penalty",
"presence_penalty",
"min_length"};
bool has_diff_runtime_args_ = false;
int* h_pinned_finished_sum_ = nullptr;
public:
curandState_t* topk_curandstate_buf()
{
return static_cast<BaseSamplingLayer<T>*>(topk_decode_)->curandstate_buf();
}
curandState_t* topp_curandstate_buf()
{
return static_cast<BaseSamplingLayer<T>*>(topp_decode_)->curandstate_buf();
}
DynamicDecodeLayer(size_t vocab_size,
size_t vocab_size_padded,
int end_id,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop);
~DynamicDecodeLayer();
DynamicDecodeLayer(DynamicDecodeLayer const& dynamic_decode_layer);
void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args);
void forward(TensorMap* output_tensors, TensorMap* input_tensors);
void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors);
};
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/layers/FfnFP8Layer.h"
#include "src/fastertransformer/kernels/activation_fp8_kernels.h"
#include "src/fastertransformer/utils/cublasFP8MMWrapper.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
namespace fastertransformer {
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::forward(TensorMap* output_tensors,
TensorMap* input_tensors,
const FfnFP8Weight<T1, T2>* ffn_weights)
{
// input tensors:
// input_hidden_state [token_num, d_model],
// output tensors:
// output_hidden_state [token_num, d_model],
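// High-level flow (a summary of the code below, no additional behavior):
// GEMM1 (d_model -> inter_size) -> bias + activation -> GEMM2 (inter_size -> d_model).
// With FUSE_GEMM_ACT defined, GEMM1 and the bias/activation are issued as a single
// fused call; otherwise they run as separate kernels.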
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() == 1);
FT_CHECK(output_tensors->size() == 1);
const int m = input_tensors->at("input_hidden_state").shape[0];
const int d_model = input_tensors->at("input_hidden_state").shape[1];
const T1* input_hidden_state = input_tensors->at("input_hidden_state").getPtr<T1>();
Tensor output_tensor = output_tensors->at("output_hidden_state");
allocateBuffer(m);
#ifdef FUSE_GEMM_ACT
if (fp8_mode_ == 1) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(inter_buf_bf16_,
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.per_channel_scale_min, // identity_scale
stream_);
invokeAddBiasActivation(m,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
ffn_weights->intermediate_weight.scale,
ffn_weights->intermediate_weight.per_channel_scale_min,
ffn_weights->output_weight.input_scale_inv);
}
else if (fp8_mode_ == 2) {
#ifdef USE_QGMMA
if (getActivationType() == ActivationType::Gelu) {
PUSH_RANGE("FFN gemm 1 bias gelu");
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Conv1x1Gemm<false, true>(inter_buf_,
m,
inter_size_,
d_model,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.bias,
*(ffn_weights->intermediate_weight.input_h_scale), // scale_a,
*(ffn_weights->intermediate_weight.weight_h_scale), // scale_b,
*(ffn_weights->output_weight.input_h_scale_inv), // scale_d,
stream_);
POP_RANGE;
}
else if (getActivationType() == ActivationType::Relu) {
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Conv1x1Gemm<true, false>(inter_buf_,
m,
inter_size_,
d_model,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.bias,
*(ffn_weights->intermediate_weight.input_h_scale), // scale_a,
*(ffn_weights->intermediate_weight.weight_h_scale), // scale_b,
*(ffn_weights->output_weight.input_h_scale_inv), // scale_d,
stream_);
}
#else // USE_QGMMA
const float alpha = 1.0f;
const float beta = 0.0f;
if (getActivationType() == ActivationType::Gelu) {
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
->Gemm_Bias_Act<false, true>(inter_buf_bf16_,
#else // FP8_GEMM_OUTPUT_QUANT_DISABLE
->Gemm_Bias_Act<false, true>(inter_buf_,
#endif // FP8_GEMM_OUTPUT_QUANT_DISABLE
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.weight_scale,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
stream_);
}
else if (getActivationType() == ActivationType::Relu) {
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
->Gemm_Bias_Act<true, false>(inter_buf_bf16_,
#else // FP8_GEMM_OUTPUT_QUANT_DISABLE
->Gemm_Bias_Act<true, false>(inter_buf_,
#endif // #ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.weight_scale,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
stream_);
}
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
invokeQuantizeMatrix<T1, T2, QUANTIZE_MODE::PER_TENSOR>(
inter_buf_, ffn_weights->output_weight.input_scale_inv, inter_buf_bf16_, m * inter_size_, 1, stream_);
#endif  // FP8_GEMM_OUTPUT_QUANT_DISABLE
#endif // USE_QGMMA
}
#else // FUSE_GEMM_ACT
PUSH_RANGE("FFN gemm 1");
#ifdef SPARSITY_ENABLED
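// The sparse GEMM path below pads the row dimension up to a multiple of 8
// (presumably a kernel alignment requirement); the rounding is equivalent to
// m_padded = ceil(m / 8) * 8.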
int m_tmp = m;
if (m_tmp % 8 != 0) {
m_tmp = (m_tmp / 8 + 1) * 8;
}
const int m_padded = m_tmp;
if (sparse_ && cublas_wrapper_->isUseSparse(1, inter_size_, m, d_model)) {
FT_CHECK(false);
// cublas_wrapper_->SpGemm(CUBLAS_OP_N,
// CUBLAS_OP_N,
// inter_size_,
// m_padded,
// d_model,
// ffn_weights->intermediate_weight.sp_kernel,
// input_hidden_state,
// inter_buf_);
}
else {
#endif // SPARSITY_ENABLED
if (fp8_mode_ == 1) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(inter_buf_bf16_,
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.per_channel_scale_min, // identity_scale
stream_);
}
else if (fp8_mode_ == 2) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(inter_buf_bf16_,
(int)1,
(int)m,
(int)inter_size_,
(int)d_model,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
input_hidden_state,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.input_scale,
ffn_weights->intermediate_weight.weight_scale,
stream_);
}
#ifdef SPARSITY_ENABLED
}
#endif // SPARSITY_ENABLED
POP_RANGE;
PUSH_RANGE("FFN add bias act");
if (fp8_mode_ == 1) {
invokeAddBiasActivation(m,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
ffn_weights->intermediate_weight.scale,
ffn_weights->intermediate_weight.per_channel_scale_min,
ffn_weights->output_weight.input_scale_inv);
}
else if (fp8_mode_ == 2) {
invokeAddBiasActivation(m,
ffn_weights->intermediate_weight.bias,
ffn_weights->intermediate_weight.output_scale,
nullptr,
nullptr,
ffn_weights->output_weight.input_scale_inv);
}
sync_check_cuda_error();
POP_RANGE;
#endif // FUSE_GEMM_ACT
PUSH_RANGE("FFN gemm 2");
#ifdef SPARSITY_ENABLED
if (sparse_ && cublas_wrapper_->isUseSparse(1, d_model, m, inter_size_)) {
FT_CHECK(false);
// cublas_wrapper_->SpGemm(CUBLAS_OP_N,
// CUBLAS_OP_N,
// d_model,
// m_padded,
// inter_size_,
// ffn_weights->output_weight.sp_kernel,
// inter_buf_,
// output_tensor);
}
else {
#endif  // SPARSITY_ENABLED
if (fp8_mode_ == 1) {
const float alpha = 1.0f;
const float beta = 0.0f;
if (output_tensor.type == TYPE_BF16) {
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(output_tensor.getPtr<T2>(),
(int)1,
(int)m,
(int)d_model,
(int)inter_size_,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
(const __nv_fp8_e4m3*)inter_buf_,
(const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
ffn_weights->output_weight.input_scale,
ffn_weights->identity_scale,
stream_);
}
else if (output_tensor.type == TYPE_FP8_E4M3) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(output_tensor.getPtr<T1>(),
(int)1,
(int)m,
(int)d_model,
(int)inter_size_,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
(const __nv_fp8_e4m3*)inter_buf_,
(const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
ffn_weights->output_weight.input_scale,
ffn_weights->output_weight.per_channel_scale_min,
ffn_weights->output_weight.output_scale_inv,
stream_);
}
else {
FT_CHECK(false);
}
}
else if (fp8_mode_ == 2) {
if (output_tensor.type == TYPE_BF16) {
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(output_tensor.getPtr<T2>(),
(int)1,
(int)m,
(int)d_model,
(int)inter_size_,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
(const __nv_fp8_e4m3*)inter_buf_,
(const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
ffn_weights->output_weight.input_scale,
ffn_weights->output_weight.weight_scale,
stream_);
}
else if (output_tensor.type == TYPE_FP8_E4M3) {
// It looks like Conv1x1Gemm does not bring better performance for this GEMM
// because its k dimension is large.
// #ifdef USE_QGMMA
// reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
//     ->Conv1x1Gemm<false, false>(output_tensor.getPtr<T1>(),
//                                 m,
//                                 d_model,
//                                 inter_size_,
//                                 inter_buf_,
//                                 ffn_weights->output_weight.kernel,
//                                 ffn_weights->output_weight.bias,
//                                 *(ffn_weights->output_weight.input_h_scale),       // scale_a
//                                 *(ffn_weights->output_weight.weight_h_scale),      // scale_b
//                                 *(ffn_weights->output_weight.output_h_scale_inv),  // scale_d
//                                 stream_);
// #else // USE_QGMMA
const float alpha = 1.0f;
const float beta = 0.0f;
reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
->Gemm(output_tensor.getPtr<T1>(),
(int)1,
(int)m,
(int)d_model,
(int)inter_size_,
(int64_t)0,
(int64_t)0,
(int64_t)0,
&alpha,
&beta,
(const __nv_fp8_e4m3*)inter_buf_,
(const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
ffn_weights->output_weight.input_scale,
ffn_weights->output_weight.weight_scale,
ffn_weights->output_weight.output_scale_inv,
stream_);
// #endif // USE_QGMMA
}
else {
FT_CHECK(false);
}
}
#ifdef SPARSITY_ENABLED
}
#endif // SPARSITY_ENABLED
POP_RANGE;
sync_check_cuda_error();
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
sync_check_cuda_error();
}
template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::FfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse),
inter_size_(inter_size),
fp8_mode_(fp8_mode)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer):
BaseLayer(ffn_layer.stream_,
ffn_layer.cublas_wrapper_,
ffn_layer.allocator_,
ffn_layer.is_free_buffer_after_forward_,
ffn_layer.cuda_device_prop_,
ffn_layer.sparse_),
inter_size_(ffn_layer.inter_size_),
fp8_mode_(ffn_layer.fp8_mode_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::~FfnFP8Layer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
cublas_wrapper_ = nullptr;
freeBuffer();
}
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::allocateBuffer(size_t token_num)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
inter_buf_ = (T1*)allocator_->reMalloc(inter_buf_, sizeof(T1) * token_num * inter_size_, false);
inter_buf_bf16_ = (T2*)allocator_->reMalloc(inter_buf_bf16_, sizeof(T2) * token_num * inter_size_, false);
is_allocate_buffer_ = true;
}
template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&inter_buf_));
allocator_->free((void**)(&inter_buf_bf16_));
is_allocate_buffer_ = false;
}
}
template class FfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;
template<typename T1, typename T2>
GeluFfnFP8Layer<T1, T2>::GeluFfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
FfnFP8Layer<T1, T2>(inter_size, fp8_mode, stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse)
{
}
template<typename T1, typename T2>
GeluFfnFP8Layer<T1, T2>::GeluFfnFP8Layer(GeluFfnFP8Layer<T1, T2> const& gelu_ffn_layer):
FfnFP8Layer<T1, T2>(gelu_ffn_layer)
{
}
template<typename T1, typename T2>
void GeluFfnFP8Layer<T1, T2>::invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale)
{
FP8ActivationParam<T1, T2> param{inter_buf_bf16_,
inter_buf_,
bias,
input_scale,
input_scale_2,
input_scale_2_min,
output_scale,
(uint32_t)m,
(uint32_t)inter_size_,
stream_};
invokeFP8AddBiasGelu<T1, T2>(param);
}
template class GeluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;
template<typename T1, typename T2>
ReluFfnFP8Layer<T1, T2>::ReluFfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
FfnFP8Layer<T1, T2>(inter_size, fp8_mode, stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse)
{
}
template<typename T1, typename T2>
ReluFfnFP8Layer<T1, T2>::ReluFfnFP8Layer(ReluFfnFP8Layer<T1, T2> const& relu_ffn_layer):
FfnFP8Layer<T1, T2>(relu_ffn_layer)
{
}
template<typename T1, typename T2>
void ReluFfnFP8Layer<T1, T2>::invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale)
{
FP8ActivationParam<T1, T2> param{inter_buf_bf16_,
inter_buf_,
bias,
input_scale,
input_scale_2,
input_scale_2_min,
output_scale,
(uint32_t)m,
(uint32_t)inter_size_,
stream_};
invokeFP8AddBiasRelu<T1, T2>(param);
}
template class ReluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/layers/FfnFP8Weight.h"
#include "src/fastertransformer/layers/FfnLayer.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include <vector>
namespace fastertransformer {
template<typename T1, typename T2>
class FfnFP8Layer: public BaseLayer {
private:
void allocateBuffer() override;
void freeBuffer() override;
void allocateBuffer(size_t token_num);
protected:
const int fp8_mode_;
T1* inter_buf_ = nullptr;
T2* inter_buf_bf16_ = nullptr;
size_t inter_size_;
virtual void invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale) = 0;
public:
FfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer);
virtual ~FfnFP8Layer();
virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnFP8Weight<T1, T2>* ffn_weights);
virtual ActivationType getActivationType() = 0;
};
template<typename T1, typename T2>
class GeluFfnFP8Layer: public FfnFP8Layer<T1, T2> {
public:
GeluFfnFP8Layer(size_t inter_size,
int fp8_mode_,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
GeluFfnFP8Layer(GeluFfnFP8Layer<T1, T2> const& ffn_layer);
virtual ~GeluFfnFP8Layer() = default;
ActivationType getActivationType() override
{
return ActivationType::Gelu;
};
protected:
using FfnFP8Layer<T1, T2>::stream_;
private:
using FfnFP8Layer<T1, T2>::inter_buf_;
using FfnFP8Layer<T1, T2>::inter_size_;
using FfnFP8Layer<T1, T2>::fp8_mode_;
using FfnFP8Layer<T1, T2>::inter_buf_bf16_;
void invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale) override;
};
template<typename T1, typename T2>
class ReluFfnFP8Layer: public FfnFP8Layer<T1, T2> {
public:
ReluFfnFP8Layer(size_t inter_size,
int fp8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
ReluFfnFP8Layer(ReluFfnFP8Layer<T1, T2> const& ffn_layer);
virtual ~ReluFfnFP8Layer() = default;
ActivationType getActivationType() override
{
return ActivationType::Relu;
};
protected:
using FfnFP8Layer<T1, T2>::stream_;
private:
using FfnFP8Layer<T1, T2>::inter_buf_;
using FfnFP8Layer<T1, T2>::inter_size_;
using FfnFP8Layer<T1, T2>::fp8_mode_;
using FfnFP8Layer<T1, T2>::inter_buf_bf16_;
void invokeAddBiasActivation(const int m,
const T2* bias,
const float* input_scale,
const float* input_scale_2,
const float* input_scale_2_min,
const float* output_scale) override;
};
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/fastertransformer/utils/ScaleList.h"
namespace fastertransformer {
template<typename T1, typename T2>
struct FfnFP8Weight: FfnWeight<T1, T2> {
ScaleList* scale_list_ptr;
float* identity_scale;
float* identity_h_scale;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/fastertransformer/utils/ScaleList.h"
namespace fastertransformer {
template<typename T>
struct FfnINT8Weight: FfnWeight<T> {
ScaleList* scale_list_ptr;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/layers/FfnLayer.h"
#include "src/fastertransformer/kernels/transpose_int8_kernels.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
namespace fastertransformer {
template<typename T>
void FfnLayer<T>::forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights)
{
TensorMap input_tensor({{"ffn_input", input_tensors->at(0)}});
TensorMap output_tensor({{"ffn_output", output_tensors->at(0)}});
forward(&output_tensor, &input_tensor, ffn_weights);
}
template<typename T>
void FfnLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnWeight<T>* ffn_weights)
{
// input tensors:
// ffn_input [token_num, hidden_dimension],
// ia3_tasks [batch_size] (optional)
// moe_k [1], uint64 (optional)
// padding_offset [token_num] (optional)
// seq_len [1], int32, (optional), only used for ia3
// output tensors:
// ffn_output [token_num, hidden_dimension] or [moe_k * token_num, hidden_dimension] if use_moe
// expert_scales [token_num, moe_k] (optional)
// expanded_source_row_to_expanded_dest_row [token_num, moe_k] (optional)
// expert_for_source_row [token_num, moe_k] (optional)
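// int8_mode_ as used below (inferred from the code paths in this function):
//   0 - plain FP GEMMs.
//   1 - weight-only int8: FP activations with int8 weights dequantized on the fly
//       (uses weight_only_quant_scale and the CUTLASS fpA_intB / MoE runners).
//   2 - full int8 GEMMs: int8 activations and weights with per-tensor scales
//       (uses scale_inter and scale_out).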
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(input_tensors->size() >= 1 && input_tensors->size() <= 5);
FT_CHECK(output_tensors->size() >= 1 && output_tensors->size() <= 4);
bool use_moe = false;
size_t moe_k = 0;
if (input_tensors->isExist("moe_k")) {
use_moe = true;
moe_k = input_tensors->at("moe_k").getVal<size_t>();
}
allocateBuffer(input_tensors->at("ffn_input").shape[0], moe_k, use_moe);
const int m = input_tensors->at("ffn_input").shape[0];
T* output_tensor = output_tensors->at("ffn_output").getPtr<T>();
const T* input_tensor = input_tensors->at("ffn_input").getPtr<const T>();
// for moe output
T* expert_scales = nullptr;
int* permuted_rows = nullptr;
int* permuted_experts = nullptr;
// The MoE outputs must either all exist or all be absent.
FT_CHECK((use_moe && output_tensors->isExist("expert_scales")
&& output_tensors->isExist("expanded_source_row_to_expanded_dest_row")
&& output_tensors->isExist("expert_for_source_row"))
|| (!use_moe && !output_tensors->isExist("expert_scales")
&& !output_tensors->isExist("expanded_source_row_to_expanded_dest_row")
&& !output_tensors->isExist("expert_for_source_row")));
if (use_moe) {
expert_scales = output_tensors->at("expert_scales").getPtr<T>();
permuted_rows = output_tensors->at("expanded_source_row_to_expanded_dest_row").getPtr<int>();
permuted_experts = output_tensors->at("expert_for_source_row").getPtr<int>();
}
// TODO: INT8 and sparsity are currently not implemented for gated activations (GeGLU / ReGLU)
const bool use_gated_activation = use_gated_activation_ && ffn_weights->intermediate_weight2.kernel != nullptr;
// MoE cannot currently be combined with gated activations
FT_CHECK(!(use_gated_activation && use_moe));
auto activation_type = getActivationType();
const int* ia3_tasks = input_tensors->getPtr<const int>("ia3_tasks", nullptr);
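// MoE path: a gating GEMM first produces [m, expert_num_] routing scores, then
// run_moe_fc dispatches each token to its top moe_k experts, runs the expert FFNs,
// and fills the permutation/score output tensors declared above.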
if (use_moe) {
PUSH_RANGE("FFN moe");
FT_CHECK(ia3_tasks == nullptr);
cublas_wrapper_->Gemm(CUBLAS_OP_N,
CUBLAS_OP_N,
expert_num_,
m,
hidden_units_,
ffn_weights->gating_weight.kernel,
expert_num_,
input_tensor,
hidden_units_,
moe_gates_buf_,
expert_num_);
if (int8_mode_ == 0) {
moe_fc_runner_->run_moe_fc(input_tensor,
moe_gates_buf_,
ffn_weights->intermediate_weight.kernel,
ffn_weights->intermediate_weight.weight_only_quant_scale,
ffn_weights->intermediate_weight.bias,
activation_type,
ffn_weights->output_weight.kernel,
ffn_weights->output_weight.weight_only_quant_scale,
m,
hidden_units_,
inter_size_,
expert_num_,
moe_k,
moe_fc_workspace_,
output_tensor,
expert_scales,
permuted_rows,
permuted_experts,
stream_);
}
else if (int8_mode_ == 1) {
FT_CHECK_WITH_INFO(moe_int8_weight_only_fc_runner_.get() != NULL,
"weight only runner was not initialized.");
FT_CHECK(ffn_weights->intermediate_weight.int8_kernel != NULL
&& ffn_weights->intermediate_weight.weight_only_quant_scale != NULL);
FT_CHECK(ffn_weights->output_weight.int8_kernel != NULL
&& ffn_weights->output_weight.weight_only_quant_scale != NULL);
moe_int8_weight_only_fc_runner_->run_moe_fc(
input_tensor,
moe_gates_buf_,
reinterpret_cast<const uint8_t*>(ffn_weights->intermediate_weight.int8_kernel),
ffn_weights->intermediate_weight.weight_only_quant_scale,
ffn_weights->intermediate_weight.bias,
activation_type,
reinterpret_cast<const uint8_t*>(ffn_weights->output_weight.int8_kernel),
ffn_weights->output_weight.weight_only_quant_scale,
m,
hidden_units_,
inter_size_,
expert_num_,
moe_k,
moe_fc_workspace_,
output_tensor,
expert_scales,
permuted_rows,
permuted_experts,
stream_);
}
else {
FT_CHECK_WITH_INFO(false, "Invalid int8 mode for MoE");
}
sync_check_cuda_error();
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
sync_check_cuda_error();
POP_RANGE;
return;
}
PUSH_RANGE("FFN gemm 1");
int m_tmp = input_tensors->at("ffn_input").shape[0];
if (m_tmp % 8 != 0) {
m_tmp = (m_tmp / 8 + 1) * 8;
}
const int m_padded = m_tmp;
#ifdef SPARSITY_ENABLED
bool use_sparse_gemm = sparse_ && cublas_wrapper_->isUseSparse(1, inter_size_, m, hidden_units_);
#else
constexpr bool use_sparse_gemm = false;
#endif
if (use_sparse_gemm) {
FT_CHECK(!use_gated_activation);
#ifdef SPARSITY_ENABLED
cublas_wrapper_->SpGemm(CUBLAS_OP_N,
CUBLAS_OP_N,
inter_size_,
m_padded,
hidden_units_,
ffn_weights->intermediate_weight.sp_kernel,
input_tensor,
inter_buf_);
#endif
}
else {
if (int8_mode_ == 1) {
FT_CHECK_WITH_INFO(weight_only_int8_fc_runner_.get() != NULL, "weight only runner was not initialized.");
FT_CHECK(ffn_weights->intermediate_weight.int8_kernel != NULL
&& ffn_weights->intermediate_weight.weight_only_quant_scale != NULL);
if (ia3_tasks == nullptr && !use_gated_activation) {
// launch fused GEMM + activation
weight_only_int8_fc_runner_->gemm_bias_act(
input_tensor,
reinterpret_cast<const uint8_t*>(ffn_weights->intermediate_weight.int8_kernel),
ffn_weights->intermediate_weight.weight_only_quant_scale,
ffn_weights->intermediate_weight.bias,
inter_buf_,
m,
inter_size_,
hidden_units_,
activation_type,
mixed_gemm_workspace_,
mixed_gemm_ws_bytes_,
stream_);
}
else {
// Otherwise, let FT handle activation
weight_only_int8_fc_runner_->gemm(
input_tensor,
reinterpret_cast<const uint8_t*>(ffn_weights->intermediate_weight.int8_kernel),
ffn_weights->intermediate_weight.weight_only_quant_scale,
inter_buf_,
m,
inter_size_,
hidden_units_,
mixed_gemm_workspace_,
mixed_gemm_ws_bytes_,
stream_);
if (use_gated_activation) {
FT_CHECK(ffn_weights->intermediate_weight2.int8_kernel != NULL
&& ffn_weights->intermediate_weight2.weight_only_quant_scale != NULL);
weight_only_int8_fc_runner_->gemm(
input_tensor,
reinterpret_cast<const uint8_t*>(ffn_weights->intermediate_weight2.int8_kernel),
ffn_weights->intermediate_weight2.weight_only_quant_scale,
inter_buf_2_,
m,
inter_size_,
hidden_units_,
mixed_gemm_workspace_,
mixed_gemm_ws_bytes_,
stream_);
}
}
}
else if (int8_mode_ == 2) {
FT_CHECK(!use_gated_activation);
cublas_wrapper_->Int8Gemm(inter_size_,
m,
hidden_units_,
ffn_weights->intermediate_weight.int8_kernel,
hidden_units_,
input_tensors->getPtr<int8_t>("ffn_input"),
hidden_units_,
reinterpret_cast<int8_t*>(inter_buf_),
inter_size_,
ffn_weights->intermediate_weight.scale_inter);
}
else {
cublas_wrapper_->Gemm(CUBLAS_OP_N,
CUBLAS_OP_N,
inter_size_,
m,
hidden_units_,
ffn_weights->intermediate_weight.kernel,
inter_size_,
input_tensor,
hidden_units_,
inter_buf_,
inter_size_);
if (use_gated_activation) {
cublas_wrapper_->Gemm(CUBLAS_OP_N,
CUBLAS_OP_N,
inter_size_,
m,
hidden_units_,
ffn_weights->intermediate_weight2.kernel,
inter_size_,
input_tensor,
hidden_units_,
inter_buf_2_,
inter_size_);
}
}
}
POP_RANGE;
if (int8_mode_ != 1 || ia3_tasks != nullptr || use_gated_activation) {
// if int8_mode == 1 && ia3_tasks == nullptr && we don't use gated activations, we use cutlass
// to fuse GEMM + bias + activation, so we skip the activation function here. In all
// other cases, we must apply the activation function separately.
PUSH_RANGE("add bias act");
genericActivation(m,
ffn_weights->intermediate_weight.bias,
use_gated_activation ? ffn_weights->intermediate_weight2.bias : nullptr,
input_tensors->at("ia3_tasks", {MEMORY_GPU, TYPE_INT32, {}, nullptr}).getPtr<const int>(),
ffn_weights->ia3_weight.kernel,
int8_mode_ == 2 ? ffn_weights->intermediate_weight.scale_out : (float*)nullptr,
int8_mode_ == 2 ? ffn_weights->output_weight.scale : (float*)nullptr,
input_tensors->getPtr<int>("padding_offset", nullptr),
input_tensors->getVal<int>("seq_len", 1));
POP_RANGE;
}
sync_check_cuda_error();
PUSH_RANGE("FFN gemm 2");
#ifdef SPARSITY_ENABLED
use_sparse_gemm = sparse_ && cublas_wrapper_->isUseSparse(1, hidden_units_, m, inter_size_);
#endif
if (use_sparse_gemm) {
#ifdef SPARSITY_ENABLED
cublas_wrapper_->SpGemm(CUBLAS_OP_N,
CUBLAS_OP_N,
hidden_units_,
m_padded,
inter_size_,
ffn_weights->output_weight.sp_kernel,
inter_buf_,
output_tensor);
#endif
}
else {
if (int8_mode_ == 1) {
FT_CHECK_WITH_INFO(weight_only_int8_fc_runner_.get() != NULL, "weight only runner was not initialized.");
FT_CHECK(ffn_weights->output_weight.int8_kernel != NULL
&& ffn_weights->output_weight.weight_only_quant_scale != NULL);
weight_only_int8_fc_runner_->gemm(inter_buf_,
reinterpret_cast<const uint8_t*>(ffn_weights->output_weight.int8_kernel),
ffn_weights->output_weight.weight_only_quant_scale,
output_tensor,
m,
hidden_units_,
inter_size_,
mixed_gemm_workspace_,
mixed_gemm_ws_bytes_,
stream_);
}
else if (int8_mode_ == 2) {
int8_fc_runner_->gemm(reinterpret_cast<int8_t*>(inter_buf_),
ffn_weights->output_weight.int8_kernel,
QuantMode::PerTensorQuant,
ffn_weights->output_weight.scale_inter,
ffn_weights->output_weight.scale_out,
output_tensors->getPtr<T>("ffn_output"),
m,
hidden_units_,
inter_size_,
nullptr,
0,
stream_);
}
else {
cublas_wrapper_->Gemm(CUBLAS_OP_N,
CUBLAS_OP_N,
hidden_units_,
m,
inter_size_,
ffn_weights->output_weight.kernel,
hidden_units_,
inter_buf_,
inter_size_,
output_tensor,
hidden_units_);
}
}
sync_check_cuda_error();
POP_RANGE;
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
sync_check_cuda_error();
}
template<typename T>
FfnLayer<T>::FfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse,
int int8_mode,
bool use_gated_activation):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse),
max_token_num_(max_batch_size * max_seq_len),
head_num_(head_num),
size_per_head_(size_per_head),
expert_num_(expert_num),
hidden_units_(head_num * size_per_head),
max_inter_size_(inter_size),
inter_size_(inter_size),
int8_mode_(int8_mode),
use_gated_activation_(use_gated_activation),
int8_fc_runner_(int8_mode == 2 ? std::make_shared<CutlassInt8GemmRunner<T>>() : nullptr)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (int8_mode_ == 0) {
moe_fc_runner_ = std::make_shared<CutlassMoeFCRunner<T, T>>();
}
else if (int8_mode_ == 1) {
FT_CHECK_WITH_INFO(!(std::is_same<T, float>::value), "Weight only quant not supported for fp32.");
moe_int8_weight_only_fc_runner_ = std::make_shared<CutlassMoeFCRunner<T, uint8_t>>();
weight_only_int8_fc_runner_ = std::make_shared<CutlassFpAIntBGemmRunner<T, uint8_t>>();
}
}
template<typename T>
FfnLayer<T>::FfnLayer(FfnLayer<T> const& ffn_layer):
BaseLayer(ffn_layer.stream_,
ffn_layer.cublas_wrapper_,
ffn_layer.allocator_,
ffn_layer.is_free_buffer_after_forward_,
ffn_layer.cuda_device_prop_,
ffn_layer.sparse_),
max_token_num_(ffn_layer.max_token_num_),
head_num_(ffn_layer.head_num_),
size_per_head_(ffn_layer.size_per_head_),
expert_num_(ffn_layer.expert_num_),
hidden_units_(ffn_layer.hidden_units_),
max_inter_size_(ffn_layer.max_inter_size_),
inter_size_(ffn_layer.inter_size_),
int8_mode_(ffn_layer.int8_mode_),
use_gated_activation_(ffn_layer.use_gated_activation_),
moe_fc_runner_(ffn_layer.moe_fc_runner_),
moe_int8_weight_only_fc_runner_(ffn_layer.moe_int8_weight_only_fc_runner_),
weight_only_int8_fc_runner_(ffn_layer.weight_only_int8_fc_runner_),
int8_fc_runner_(ffn_layer.int8_fc_runner_)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
}
template<typename T>
FfnLayer<T>::~FfnLayer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
cublas_wrapper_ = nullptr;
freeBuffer();
}
template<typename T>
void FfnLayer<T>::allocateBuffer()
{
FT_CHECK_WITH_INFO(false,
"FfnLayer::allocateBuffer() is deprecated. Use `allocateBuffer(size_t token_num, ...)` instead");
}
template<typename T>
void FfnLayer<T>::allocateBuffer(size_t token_num, int moe_k, bool use_moe)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (use_moe) {
moe_gates_buf_ =
(T*)allocator_->reMalloc(moe_gates_buf_, sizeof(T) * pad_to_multiple_of_16(token_num * expert_num_), false);
size_t ws_size_moe = 0;
if (int8_mode_ == 0) {
FT_CHECK_WITH_INFO(moe_fc_runner_.get() != NULL, "moe runner was not initialized.");
ws_size_moe = moe_fc_runner_->getWorkspaceSize(token_num, hidden_units_, inter_size_, expert_num_, moe_k);
}
else if (int8_mode_ == 1) {
FT_CHECK_WITH_INFO(moe_int8_weight_only_fc_runner_.get() != NULL,
"weight only moe runner was not initialized.");
ws_size_moe = moe_int8_weight_only_fc_runner_->getWorkspaceSize(
token_num, hidden_units_, inter_size_, expert_num_, moe_k);
}
moe_fc_workspace_ = (char*)allocator_->reMalloc(moe_fc_workspace_, sizeof(char) * ws_size_moe, false);
}
else {
const auto type_size = int8_mode_ == 2 ? sizeof(int8_t) : sizeof(T);
inter_buf_ = (T*)allocator_->reMalloc(inter_buf_, type_size * token_num * max_inter_size_, false);
if (use_gated_activation_) {
inter_buf_2_ = (T*)allocator_->reMalloc(inter_buf_2_, sizeof(T) * token_num * max_inter_size_, false);
}
if (int8_mode_ == 1) {
FT_CHECK_WITH_INFO(weight_only_int8_fc_runner_.get() != NULL, "weight only runner was not initialized.");
// We use max_size for n and k since we reuse buffers for both FCs and want to allocate the max
// possible memory that would be required by any of the individual gemms.
const int max_size = std::max(hidden_units_, inter_size_);
mixed_gemm_ws_bytes_ = weight_only_int8_fc_runner_->getWorkspaceSize(token_num, max_size, max_size);
mixed_gemm_workspace_ = (char*)allocator_->reMalloc(mixed_gemm_workspace_, mixed_gemm_ws_bytes_, false);
}
else if (int8_mode_ == 2) {
const int max_size = std::max(hidden_units_, inter_size_);
int8_gemm_ws_bytes_ = int8_fc_runner_->getWorkspaceSize(token_num, max_size, max_size);
int8_gemm_workspace_ = (char*)allocator_->reMalloc(int8_gemm_workspace_, int8_gemm_ws_bytes_, false);
}
}
is_allocate_buffer_ = true;
}
template<typename T>
void FfnLayer<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)(&inter_buf_));
if (use_gated_activation_) {
allocator_->free((void**)(&inter_buf_2_));
}
if (expert_num_ != 0) {
allocator_->free((void**)(&moe_gates_buf_));
allocator_->free((void**)(&moe_fc_workspace_));
}
if (mixed_gemm_workspace_) {
allocator_->free((void**)(&mixed_gemm_workspace_));
mixed_gemm_ws_bytes_ = 0;
}
is_allocate_buffer_ = false;
}
}
#define INVOKE_GENERIC_ACT(ACT) \
invokeGenericActivation<ACT>(inter_buf_, \
bias1, \
inter_buf_2_, \
bias2, \
ia3_tasks, \
ia3_weights, \
m, \
inter_size_, \
int8_mode_, \
activation_in, \
activation_out, \
padding_offset, \
seq_len, \
stream_);
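// Informal summary of the gated path (see invokeGenericActivation in
// activation_kernels.h for the exact kernel): with a gated activation the two
// intermediate projections are combined elementwise, roughly
//   inter_buf_ = Act(inter_buf_ + bias1) * (inter_buf_2_ + bias2)
// with the biases broadcast over rows; without gating only the first factor is
// kept. IA3 weights, when provided, additionally rescale the activation per task.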
template<typename T>
void FfnLayer<T>::genericActivation(int m,
const T* bias1,
const T* bias2,
const int* ia3_tasks,
const T* ia3_weights,
const float* activation_in,
const float* activation_out,
const int* padding_offset,
const int seq_len)
{
if (ia3_tasks != nullptr) {
FT_CHECK(seq_len > 0);
}
    // Dispatch according to the activation type reported by the derived layer;
    // the GELU path has a fused bias + GELU fast path when there is no gated buffer
    // and no SmoothQuant dequantization (int8_mode_ <= 1).
switch (getActivationType()) {
case ActivationType::Gelu:
case ActivationType::GeGLU:
if (inter_buf_2_ == nullptr && int8_mode_ <= 1) {
invokeAddBiasGeluV2(
inter_buf_, bias1, ia3_tasks, ia3_weights, padding_offset, seq_len, m, inter_size_, stream_);
}
else {
INVOKE_GENERIC_ACT(GeluActivation);
}
break;
case ActivationType::Relu:
case ActivationType::ReGLU:
INVOKE_GENERIC_ACT(ReluActivation);
break;
case ActivationType::Silu:
case ActivationType::SiGLU:
INVOKE_GENERIC_ACT(SiluActivation);
break;
case ActivationType::Identity:
INVOKE_GENERIC_ACT(IdentityActivation);
break;
}
}
#undef INVOKE_GENERIC_ACT
template class FfnLayer<float>;
template class FfnLayer<half>;
#ifdef ENABLE_BF16
template class FfnLayer<__nv_bfloat16>;
#endif
template<typename T>
GeluFfnLayer<T>::GeluFfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse,
int int8_mode,
bool use_gated_activation):
FfnLayer<T>(max_batch_size,
max_seq_len,
head_num,
size_per_head,
expert_num,
inter_size,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward,
sparse,
int8_mode,
use_gated_activation)
{
}
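// Illustrative construction (the values below are placeholders, not defaults of this
// class):
//   GeluFfnLayer<half> ffn_layer(/*max_batch_size=*/8,
//                                /*max_seq_len=*/1024,
//                                /*head_num=*/32,
//                                /*size_per_head=*/128,
//                                /*expert_num=*/0,
//                                /*inter_size=*/4 * 32 * 128,
//                                stream, cublas_wrapper, allocator,
//                                /*is_free_buffer_after_forward=*/false);
// sparse, int8_mode and use_gated_activation keep their defaults (false, 0, false).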
template<typename T>
GeluFfnLayer<T>::GeluFfnLayer(GeluFfnLayer<T> const& gelu_ffn_layer): FfnLayer<T>(gelu_ffn_layer)
{
}
template class GeluFfnLayer<float>;
template class GeluFfnLayer<half>;
#ifdef ENABLE_BF16
template class GeluFfnLayer<__nv_bfloat16>;
#endif
template<typename T>
ReluFfnLayer<T>::ReluFfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse,
int int8_mode,
bool use_gated_activation):
FfnLayer<T>(max_batch_size,
max_seq_len,
head_num,
size_per_head,
expert_num,
inter_size,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward,
sparse,
int8_mode,
use_gated_activation)
{
}
template<typename T>
ReluFfnLayer<T>::ReluFfnLayer(ReluFfnLayer<T> const& relu_ffn_layer): FfnLayer<T>(relu_ffn_layer)
{
}
template class ReluFfnLayer<float>;
template class ReluFfnLayer<half>;
#ifdef ENABLE_BF16
template class ReluFfnLayer<__nv_bfloat16>;
#endif
template<typename T>
SiluFfnLayer<T>::SiluFfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse,
bool use_gated_activation):
FfnLayer<T>(max_batch_size,
max_seq_len,
head_num,
size_per_head,
expert_num,
inter_size,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward,
sparse,
0,
use_gated_activation)
{
}
template<typename T>
SiluFfnLayer<T>::SiluFfnLayer(SiluFfnLayer<T> const& gelu_ffn_layer): FfnLayer<T>(gelu_ffn_layer)
{
}
template class SiluFfnLayer<float>;
template class SiluFfnLayer<half>;
#ifdef ENABLE_BF16
template class SiluFfnLayer<__nv_bfloat16>;
#endif
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/kernels/activation_kernels.h"
#include "src/fastertransformer/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
#include "src/fastertransformer/kernels/cutlass_kernels/int8_gemm/int8_gemm.h"
#include "src/fastertransformer/kernels/matrix_vector_multiplication.h"
#include "src/fastertransformer/kernels/moe_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/layers/FfnWeight.h"
#include "src/fastertransformer/utils/activation_types.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include <stdint.h>
#include <vector>
namespace fastertransformer {
template<typename T>
class FfnLayer: public BaseLayer {
private:
// buffer handling
size_t max_token_num_ = 0;
// meta data
size_t head_num_; // (martinma): this member is not used in this class. Remove it?
size_t size_per_head_; // (martinma): this member is not used in this class. Remove it?
size_t expert_num_;
// calculated data
size_t hidden_units_;
// gated activation
bool use_gated_activation_;
std::shared_ptr<CutlassMoeFCRunner<T, T>> moe_fc_runner_;
std::shared_ptr<CutlassMoeFCRunner<T, uint8_t>> moe_int8_weight_only_fc_runner_;
std::shared_ptr<CutlassFpAIntBGemmRunner<T, uint8_t>> weight_only_int8_fc_runner_;
std::shared_ptr<CutlassInt8GemmRunner<T>> int8_fc_runner_;
void allocateBuffer() override;
void freeBuffer() override;
void allocateBuffer(size_t token_num, int moe_k = 0, bool use_moe = false);
protected:
T* inter_buf_ = nullptr;
T* inter_buf_2_ = nullptr; // for gated activation
T* moe_gates_buf_ = nullptr;
char* moe_fc_workspace_ = nullptr;
char* mixed_gemm_workspace_ = nullptr;
size_t mixed_gemm_ws_bytes_ = 0;
char* int8_gemm_workspace_ = nullptr;
size_t int8_gemm_ws_bytes_ = 0;
size_t inter_size_;
    /* Used to allocate memory buffers:
       different FFN layers (inter_size) reuse the same FfnLayer instance,
       which is sized for the maximum inter size, so max_inter_size is passed
       as inter_size when the layer is constructed.
    */
size_t max_inter_size_;
    // int8_mode_ == 0: no INT8-related mechanism is used.
    // int8_mode_ == 1: weight-only quantized GEMM (GPT).
    // int8_mode_ == 2: SmoothQuant O3 (per-tensor scales).
int int8_mode_ = 0;
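    // For reference, the mode selects the GEMM path taken in forward():
    //   0 -> cublas_wrapper_->Gemm (and moe_fc_runner_ for MoE),
    //   1 -> weight_only_int8_fc_runner_ / moe_int8_weight_only_fc_runner_,
    //   2 -> cublas_wrapper_->Int8Gemm for FC1 and int8_fc_runner_ for FC2.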
virtual ActivationType getActivationType() const
{
return ActivationType::InvalidType;
};
void genericActivation(int m,
const T* bias1,
const T* bias2,
const int* ia3_tasks,
const T* ia3_weights,
const float* activation_in,
const float* activation_out,
const int* padding_offset,
const int seq_len);
public:
FfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num, // (martinma): redundant parameter?
size_t size_per_head, // (martinma): redundant parameter?
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false,
int int8_mode = 0,
bool use_gated_activation = false);
FfnLayer(FfnLayer<T> const& ffn_layer);
virtual ~FfnLayer();
void resetInterSize(size_t runtime_inter_size)
{
inter_size_ = runtime_inter_size;
}
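    // Illustrative use (caller-side sketch): shrink the active inter size for one
    // forward pass. runtime_inter_size must not exceed the inter_size the layer was
    // constructed with, since buffers are sized from max_inter_size_.
    //   ffn_layer.resetInterSize(runtime_inter_size);
    //   ffn_layer.forward(&output_tensors, &input_tensors, &ffn_weights);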
virtual void forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights);
virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnWeight<T>* ffn_weights);
};
template<typename T>
class GeluFfnLayer: public FfnLayer<T> {
public:
GeluFfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false,
int int8_mode = 0,
bool use_gated_activation = false);
GeluFfnLayer(GeluFfnLayer<T> const& ffn_layer);
virtual ~GeluFfnLayer() = default;
protected:
using FfnLayer<T>::stream_;
virtual ActivationType getActivationType() const override
{
return ActivationType::Gelu;
};
private:
using FfnLayer<T>::inter_buf_;
using FfnLayer<T>::inter_buf_2_;
using FfnLayer<T>::inter_size_;
};
template<typename T>
class ReluFfnLayer: public FfnLayer<T> {
public:
ReluFfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false,
int int8_mode = 0,
bool use_gated_activation = false);
ReluFfnLayer(ReluFfnLayer<T> const& ffn_layer);
virtual ~ReluFfnLayer() = default;
protected:
using FfnLayer<T>::stream_;
virtual ActivationType getActivationType() const override
{
return ActivationType::Relu;
};
private:
using FfnLayer<T>::inter_buf_;
using FfnLayer<T>::inter_buf_2_;
using FfnLayer<T>::inter_size_;
};
template<typename T>
class SiluFfnLayer: public FfnLayer<T> {
public:
SiluFfnLayer(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t expert_num,
size_t inter_size,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false,
bool use_gated_activation = false);
SiluFfnLayer(SiluFfnLayer<T> const& ffn_layer);
virtual ~SiluFfnLayer() = default;
protected:
using FfnLayer<T>::stream_;
virtual ActivationType getActivationType() const override
{
return ActivationType::Silu;
};
private:
using FfnLayer<T>::inter_buf_;
using FfnLayer<T>::inter_buf_2_;
using FfnLayer<T>::inter_size_;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "FfnLayerINT8.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
namespace fastertransformer {
template<typename T>
void FfnLayerINT8<T>::forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights)
{
// input_tensors: [input (token_num, hidden_dimension)]
// output_tensors: [output (token_num, hidden_dimension)]
ScaleList* scale_list = ((const FfnINT8Weight<T>*)ffn_weights)->scale_list_ptr;
cublasINT8MMWrapper* cublas_wrapper = (cublasINT8MMWrapper*)cublas_wrapper_;
FT_CHECK(isValidTokenNum(input_tensors->at(0).shape[0]));
allocateBuffer();
const int m = static_cast<int>(input_tensors->at(0).shape[0]);
#ifdef SPARSITY_ENABLED
int m_tmp = m;
if (m_tmp % 16 != 0) {
m_tmp = (m_tmp / 16 + 1) * 16;
}
const int m_padded = m_tmp;
#endif
int32_t* output_tensor = output_tensors->at(0).getPtr<int32_t>();
const int8_t* input_tensor = input_tensors->at(0).getPtr<const int8_t>();
PUSH_RANGE("FFN gemm 1");
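    // Two quantization schemes are handled below: int8_mode 1 keeps the GEMM output in
    // int32 (inter_int_buf_) and dequantizes inside the bias + activation kernel, while
    // int8_mode 2/3 requantize the GEMM output directly to int8 using the per-layer
    // scale taken from scale_list (h_scale_list_[p3_offset_ + 6]).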
if (int8_mode_ == 1) {
cublas_wrapper->Gemm(inter_int_buf_,
1,
m,
inter_size_,
hidden_units_,
0,
0,
0,
input_tensor,
(int8_t*)(ffn_weights->intermediate_weight.kernel));
}
else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
if (sparse_) {
cublas_wrapper->SpGemm(inter_size_,
m_padded,
hidden_units_,
scale_list->h_scale_list_[scale_list->p3_offset_ + 6],
(int8_t*)(ffn_weights->intermediate_weight.sp_kernel),
input_tensor,
(int8_t*)inter_int_buf_);
}
else {
#endif
cublas_wrapper->Gemm((int8_t*)inter_int_buf_,
1,
m,
inter_size_,
hidden_units_,
0,
0,
0,
scale_list->h_scale_list_[scale_list->p3_offset_ + 6],
input_tensor,
(int8_t*)(ffn_weights->intermediate_weight.kernel));
#ifdef SPARSITY_ENABLED
}
#endif
}
POP_RANGE;
PUSH_RANGE("add bias act");
invokeAddBiasActivation(m, ffn_weights->intermediate_weight.bias, scale_list);
POP_RANGE;
sync_check_cuda_error();
PUSH_RANGE("FFN gemm 2");
if (int8_mode_ == 1) {
cublas_wrapper->Gemm(output_tensor,
1,
m,
hidden_units_,
inter_size_,
0,
0,
0,
inter_buf_,
(int8_t*)(ffn_weights->output_weight.kernel));
}
else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
if (sparse_) {
cublas_wrapper->SpGemm(hidden_units_,
m_padded,
inter_size_,
scale_list->h_scale_list_[scale_list->p3_offset_ + 7],
(int8_t*)(ffn_weights->output_weight.sp_kernel),
inter_buf_,
(int8_t*)output_tensor);
}
else {
#endif
cublas_wrapper->Gemm((int8_t*)output_tensor,
1,
m,
hidden_units_,
inter_size_,
0,
0,
0,
scale_list->h_scale_list_[scale_list->p3_offset_ + 7],
inter_buf_,
(int8_t*)(ffn_weights->output_weight.kernel));
#ifdef SPARSITY_ENABLED
}
#endif
}
POP_RANGE;
sync_check_cuda_error();
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
sync_check_cuda_error();
}
template<typename T>
FfnLayerINT8<T>::FfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
max_token_num_(max_batch_size * max_seq_len),
head_num_(head_num),
size_per_head_(size_per_head),
hidden_units_(head_num * size_per_head),
inter_size_(inter_size),
int8_mode_(int8_mode),
sparse_(sparse)
{
}
template<typename T>
FfnLayerINT8<T>::FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer):
BaseLayer(
ffn_layer.stream_, ffn_layer.cublas_wrapper_, ffn_layer.allocator_, ffn_layer.is_free_buffer_after_forward_),
max_token_num_(ffn_layer.max_token_num_),
head_num_(ffn_layer.head_num_),
size_per_head_(ffn_layer.size_per_head_),
hidden_units_(ffn_layer.hidden_units_),
inter_size_(ffn_layer.inter_size_),
int8_mode_(ffn_layer.int8_mode_),
sparse_(ffn_layer.sparse_)
{
}
template<typename T>
FfnLayerINT8<T>::~FfnLayerINT8()
{
cublas_wrapper_ = nullptr;
freeBuffer();
}
template<typename T>
void FfnLayerINT8<T>::allocateBuffer()
{
if (is_allocate_buffer_ == false) {
inter_int_buf_ =
(int32_t*)allocator_->reMalloc(inter_int_buf_, sizeof(int32_t) * max_token_num_ * inter_size_, false);
inter_buf_ = (int8_t*)allocator_->reMalloc(inter_buf_, sizeof(int8_t) * max_token_num_ * inter_size_, false);
is_allocate_buffer_ = true;
}
}
template<typename T>
void FfnLayerINT8<T>::freeBuffer()
{
if (is_allocate_buffer_ == true) {
allocator_->free((void**)(&inter_int_buf_));
allocator_->free((void**)(&inter_buf_));
is_allocate_buffer_ = false;
}
}
template<typename T>
bool FfnLayerINT8<T>::isValidTokenNum(size_t token_num)
{
if (max_token_num_ == 0) {
max_token_num_ = token_num;
return true;
}
else {
return token_num <= max_token_num_;
}
}
template class FfnLayerINT8<float>;
template class FfnLayerINT8<half>;
template<typename T>
GeluFfnLayerINT8<T>::GeluFfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse):
FfnLayerINT8<T>(max_batch_size,
max_seq_len,
head_num,
size_per_head,
inter_size,
int8_mode,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward,
sparse)
{
}
template<typename T>
GeluFfnLayerINT8<T>::GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& gelu_ffn_layer): FfnLayerINT8<T>(gelu_ffn_layer)
{
}
template<typename T>
void GeluFfnLayerINT8<T>::invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list)
{
if (int8_mode_ == 1) {
invokeAddBiasGeluCol32<T>(inter_buf_,
inter_int_buf_,
bias,
m,
inter_size_,
stream_,
&(scale_list->d_scale_list_[scale_list->p2_offset_ + 4 * hidden_units_]),
&(scale_list->d_scale_list_[44 + 2]),
&(scale_list->d_scale_list_[52 + 3]));
}
else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
if (sparse_) {
invokeAddBiasGeluRow<T>(inter_buf_,
(const int8_t*)inter_int_buf_,
bias,
m,
inter_size_,
stream_,
&(scale_list->d_scale_list_[48 + 1]),
&(scale_list->d_scale_list_[52 + 3]));
}
else {
#endif
invokeAddBiasGeluCol32<T>(inter_buf_,
(const int8_t*)inter_int_buf_,
bias,
m,
inter_size_,
stream_,
&(scale_list->d_scale_list_[48 + 1]),
&(scale_list->d_scale_list_[52 + 3]));
#ifdef SPARSITY_ENABLED
}
#endif
}
}
template class GeluFfnLayerINT8<float>;
template class GeluFfnLayerINT8<half>;
template<typename T>
ReluFfnLayerINT8<T>::ReluFfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
FfnLayerINT8<T>(max_batch_size,
max_seq_len,
head_num,
size_per_head,
inter_size,
int8_mode,
stream,
cublas_wrapper,
allocator,
is_free_buffer_after_forward)
{
}
template<typename T>
ReluFfnLayerINT8<T>::ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& relu_ffn_layer): FfnLayerINT8<T>(relu_ffn_layer)
{
}
template<typename T>
void ReluFfnLayerINT8<T>::invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list)
{
    // TODO: add-bias + ReLU activation for the INT8 path is not implemented yet.
}
template class ReluFfnLayerINT8<float>;
template class ReluFfnLayerINT8<half>;
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnINT8Weight.h"
#include "src/fastertransformer/kernels/activation_int8_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/utils/ScaleList.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasINT8MMWrapper.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include <vector>
namespace fastertransformer {
template<typename T>
class GeluFfnLayerINT8;
template<typename T>
class ReluFfnLayerINT8;
template<typename T>
class FfnLayerINT8: public BaseLayer {
private:
// buffer handling
size_t max_token_num_ = 0;
// meta data
size_t head_num_;
size_t size_per_head_;
// calculated data
size_t hidden_units_;
void allocateBuffer() override;
void freeBuffer() override;
bool isValidTokenNum(size_t token_num);
protected:
size_t inter_size_;
int int8_mode_;
bool sparse_;
    int32_t* inter_int_buf_ = nullptr;
    int8_t*  inter_buf_     = nullptr;
virtual void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) = 0;
public:
FfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer);
~FfnLayerINT8();
void forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights);
friend GeluFfnLayerINT8<T>;
friend ReluFfnLayerINT8<T>;
};
template<typename T>
class GeluFfnLayerINT8: public FfnLayerINT8<T> {
public:
GeluFfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& ffn_layer);
~GeluFfnLayerINT8() = default;
private:
using FfnLayerINT8<T>::inter_int_buf_;
using FfnLayerINT8<T>::inter_buf_;
using FfnLayerINT8<T>::inter_size_;
using FfnLayerINT8<T>::stream_;
using FfnLayerINT8<T>::int8_mode_;
using FfnLayerINT8<T>::sparse_;
using FfnLayerINT8<T>::hidden_units_;
void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};
template<typename T>
class ReluFfnLayerINT8: public FfnLayerINT8<T> {
public:
ReluFfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& ffn_layer);
~ReluFfnLayerINT8() = default;
private:
using FfnLayerINT8<T>::inter_int_buf_;
using FfnLayerINT8<T>::inter_buf_;
using FfnLayerINT8<T>::inter_size_;
using FfnLayerINT8<T>::stream_;
using FfnLayerINT8<T>::int8_mode_;
using FfnLayerINT8<T>::hidden_units_;
void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "DenseWeight.h"
namespace fastertransformer {
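// Weight bundle for one FFN block. For gated activations (GeGLU / ReGLU / SiGLU),
// intermediate_weight and intermediate_weight2 are both applied to the FFN input and
// combined by the activation kernel; gating_weight is the MoE router projection and
// ia3_weight holds the IA3 per-task rescaling factors (see FfnLayer<T>::forward).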
template<typename T1, typename T2 = T1>
struct FfnWeight {
DenseWeight<T1, T2> gating_weight;
DenseWeight<T1, T2> intermediate_weight;
DenseWeight<T1, T2> intermediate_weight2; // for gated activation
DenseWeight<T1, T2> output_weight;
DenseWeight<T1, T2> ia3_weight;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/DenseWeight.h"
namespace fastertransformer {
template<typename T1, typename T2 = T1>
struct AttentionWeight {
DenseWeight<T1, T2> query_weight;
DenseWeight<T1, T2> key_weight;
DenseWeight<T1, T2> value_weight;
DenseWeight<T1, T2> attention_output_weight;
DenseWeight<T1, T2> ia3_key_weight;
DenseWeight<T1, T2> ia3_value_weight;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <assert.h>
#include <cstdlib>  // std::getenv, used in getAttentionType below
#include <vector>
// #include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h"
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
#include "src/fastertransformer/utils/memory_utils.h"
namespace fastertransformer {
enum class AttentionType {
UNFUSED_MHA,
UNFUSED_PADDED_MHA,
FUSED_MHA,
FUSED_PADDED_MHA
};
/* NOTE:
1. only swin-style relative position bias is supported currently
2. gpt-style (causal-mask) models support any-sequence-length fmha, so we don't need to call isValidSeqLen at run-time
3. bert/vit can also support any-seq-length fmha
*/
template<typename T>
AttentionType getAttentionType(size_t size_per_head,
const int sm,
const bool remove_padding,
const int max_seq_len,
const bool is_fuse = true,
const bool with_swin_relative_position_bias = false,
const bool causal_mask = false)
{
if (std::is_same<T, half>::value && is_fuse) {
// Bert/Vit
if (!causal_mask) {
if (!with_swin_relative_position_bias
&& (((sm == kSM_70 || sm == kSM_72) && size_per_head == 64)
|| ((sm == kSM_75 || sm == kSM_80 || sm == kSM_86)
&& (size_per_head == 64 || size_per_head == 32)))) {
return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA;
}
else if (with_swin_relative_position_bias && (sm == kSM_75 || sm == kSM_80 || sm == kSM_86)
&& max_seq_len <= 256 && size_per_head == 32) {
return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA;
}
}
// GPT and its variants
else {
// FMHA_ENABLE only affects gpt-style models (causal-mask)
            char* fused_qkv = std::getenv("FMHA_ENABLE");
if (fused_qkv != nullptr && std::string(fused_qkv) == "ON") {
if ((sm == kSM_70 || sm == kSM_72 || sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_89)
&& (size_per_head == 32 || size_per_head == 40 || size_per_head == 64 || size_per_head == 80
|| size_per_head == 128 || size_per_head == 144 || size_per_head == 160 || size_per_head == 256)) {
return remove_padding ? AttentionType::FUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
}
}
}
}
#ifdef ENABLE_FP8
else if (std::is_same<T, __nv_fp8_e4m3>::value && is_fuse) {
if (!causal_mask) {
if ((sm == kSM_89 || sm == kSM_90) && max_seq_len < 512 && is_fuse && size_per_head == 64) {
return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA;
}
else {
return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
}
}
}
#endif
return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
}
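// Example (illustrative; the SM value and shapes are assumptions, not defaults): for a
// non-causal fp16 model on SM 80 with padding removed,
//   AttentionType t = getAttentionType<half>(/*size_per_head=*/64, /*sm=*/kSM_80,
//                                            /*remove_padding=*/true, /*max_seq_len=*/384);
// takes the first branch above and yields AttentionType::FUSED_MHA. For causal-mask
// (GPT-style) models the fused path additionally requires the FMHA_ENABLE=ON
// environment variable, as checked above.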
template<typename T>
AttentionType getAttentionTypeINT8(
size_t size_per_head, const int sm, const bool remove_padding, const int max_seq_len, const int int8_mode)
{
if ((int8_mode == 1 || int8_mode == 2)
&& (((sm == kSM_80 || sm == kSM_86) && (size_per_head == 64 || size_per_head == 32) && max_seq_len <= 512)
|| (sm == kSM_75
&& ((size_per_head == 64 && max_seq_len <= 384) || (size_per_head == 32 && max_seq_len <= 512))))) {
return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA;
}
else {
return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
}
}
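// Example (illustrative): getAttentionTypeINT8<half>(/*size_per_head=*/64, /*sm=*/kSM_86,
// /*remove_padding=*/true, /*max_seq_len=*/384, /*int8_mode=*/2) satisfies the first
// condition and returns AttentionType::FUSED_MHA.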
inline bool isFusedMHA(AttentionType attention_type)
{
return attention_type == AttentionType::FUSED_MHA || attention_type == AttentionType::FUSED_PADDED_MHA;
}
inline bool isUnPaddedMHA(AttentionType attention_type)
{
return attention_type == AttentionType::FUSED_MHA || attention_type == AttentionType::UNFUSED_MHA;
}
inline bool isPaddedMHA(AttentionType attention_type)
{
return attention_type == AttentionType::FUSED_PADDED_MHA || attention_type == AttentionType::UNFUSED_PADDED_MHA;
}
inline AttentionType getUnfusedAttentionType(AttentionType attention_type)
{
if (attention_type == AttentionType::FUSED_MHA) {
return AttentionType::UNFUSED_MHA;
}
else if (attention_type == AttentionType::FUSED_PADDED_MHA) {
return AttentionType::UNFUSED_PADDED_MHA;
}
return attention_type;
}
template<typename T>
class BaseAttentionLayer: public BaseLayer {
public:
virtual void
forward(TensorMap* output_tensors, TensorMap* input_tensors, const AttentionWeight<T>* attention_weights) = 0;
BaseAttentionLayer(cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse)
{
}
virtual ~BaseAttentionLayer() = default;
virtual bool isValidSeqLen(const size_t seq_len)
{
return true;
}
};
} // namespace fastertransformer
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/ScaleList.h"
namespace fastertransformer {
template<typename T1, typename T2>
struct AttentionFP8Weight: public AttentionWeight<T1, T2> {
    const float* qk_scale         = nullptr;
    const float* qk_scale_inv     = nullptr;
    float*       qk_h_scale       = nullptr;
    float*       qk_h_scale_inv   = nullptr;
    float*       identity_scale   = nullptr;
    float*       identity_h_scale = nullptr;
};
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <assert.h>
#include <vector>
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h"
#include "src/fastertransformer/layers/attention_layers_fp8/AttentionFP8Weight.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasFP8MMWrapper.h"
#include "src/fastertransformer/utils/memory_utils.h"
namespace fastertransformer {
// template<typename T>
// AttentionType getAttentionType(size_t size_per_head, const int sm, const bool remove_padding, const int max_seq_len,
// const bool is_fuse = true)
// {
// if (std::is_same<T, half>::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm ==
// kSM_72)
// && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) {
// return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA;
// }
// else {
// return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
// }
// }
template<typename T1, typename T2>
class BaseAttentionFP8Layer: public BaseLayer {
public:
virtual void forward(TensorMap* output_tensors,
TensorMap* input_tensors,
const AttentionFP8Weight<T1, T2>* attention_weights) = 0;
BaseAttentionFP8Layer(cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse)
{
}
virtual ~BaseAttentionFP8Layer() = default;
};
} // namespace fastertransformer
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)