diff --git a/.gitignore b/.gitignore index a143965a0238d469c21f30ce911ee6d18112abc7..e8bb8c8570b5e231720120f2f8600792a13b86e8 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ __pycache__/ *.egg-info/ workspace/ +.cache +*build*/ diff --git a/3rdparty/INIReader.h b/3rdparty/INIReader.h new file mode 100644 index 0000000000000000000000000000000000000000..7d40f0638f2ba88342035b8e33c45a9029320d84 --- /dev/null +++ b/3rdparty/INIReader.h @@ -0,0 +1,501 @@ +// Read an INI file into easy-to-access name/value pairs. + +// inih and INIReader are released under the New BSD license. +// Go to the project home page for more info: +// +// https://github.com/benhoyt/inih (Initial repo) +// https://github.com/jtilly/inih (The reference of this header file) +/* inih -- simple .INI file parser +inih is released under the New BSD license (see LICENSE.txt). Go to the project +home page for more info: +https://github.com/benhoyt/inih +https://github.com/jtilly/inih +*/ + +#ifndef __INI_H__ +#define __INI_H__ + +/* Make this header file easier to include in C++ code */ +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* Typedef for prototype of handler function. */ +typedef int (*ini_handler)(void* user, const char* section, + const char* name, const char* value); + +/* Typedef for prototype of fgets-style reader function. */ +typedef char* (*ini_reader)(char* str, int num, void* stream); + +/* Parse given INI-style file. May have [section]s, name=value pairs + (whitespace stripped), and comments starting with ';' (semicolon). Section + is "" if name=value pair parsed before any section heading. name:value + pairs are also supported as a concession to Python's configparser. + For each name=value pair parsed, call handler function with given user + pointer as well as section, name, and value (data only valid for duration + of handler call). Handler should return nonzero on success, zero on error. + Returns 0 on success, line number of first error on parse error (doesn't + stop on first error), -1 on file open error, or -2 on memory allocation + error (only when INI_USE_STACK is zero). +*/ +int ini_parse(const char* filename, ini_handler handler, void* user); + +/* Same as ini_parse(), but takes a FILE* instead of filename. This doesn't + close the file when it's finished -- the caller must do that. */ +int ini_parse_file(FILE* file, ini_handler handler, void* user); + +/* Same as ini_parse(), but takes an ini_reader function pointer instead of + filename. Used for implementing custom or string-based I/O. */ +int ini_parse_stream(ini_reader reader, void* stream, ini_handler handler, + void* user); + +/* Nonzero to allow multi-line value parsing, in the style of Python's + configparser. If allowed, ini_parse() will call the handler with the same + name for each subsequent line parsed. */ +#ifndef INI_ALLOW_MULTILINE +#define INI_ALLOW_MULTILINE 1 +#endif + +/* Nonzero to allow a UTF-8 BOM sequence (0xEF 0xBB 0xBF) at the start of + the file. See http://code.google.com/p/inih/issues/detail?id=21 */ +#ifndef INI_ALLOW_BOM +#define INI_ALLOW_BOM 1 +#endif + +/* Nonzero to allow inline comments (with valid inline comment characters + specified by INI_INLINE_COMMENT_PREFIXES). Set to 0 to turn off and match + Python 3.2+ configparser behaviour. */ +#ifndef INI_ALLOW_INLINE_COMMENTS +#define INI_ALLOW_INLINE_COMMENTS 1 +#endif +#ifndef INI_INLINE_COMMENT_PREFIXES +#define INI_INLINE_COMMENT_PREFIXES ";" +#endif + +/* Nonzero to use stack, zero to use heap (malloc/free). 
*/ +#ifndef INI_USE_STACK +#define INI_USE_STACK 1 +#endif + +/* Stop parsing on first error (default is to keep parsing). */ +#ifndef INI_STOP_ON_FIRST_ERROR +#define INI_STOP_ON_FIRST_ERROR 0 +#endif + +/* Maximum line length for any line in INI file. */ +#ifndef INI_MAX_LINE +#define INI_MAX_LINE 200 +#endif + +#ifdef __cplusplus +} +#endif + +/* inih -- simple .INI file parser +inih is released under the New BSD license (see LICENSE.txt). Go to the project +home page for more info: +https://github.com/benhoyt/inih +*/ + +#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS) +#define _CRT_SECURE_NO_WARNINGS +#endif + +#include +#include +#include + +#if !INI_USE_STACK +#include +#endif + +#define MAX_SECTION 50 +#define MAX_NAME 50 + +/* Strip whitespace chars off end of given string, in place. Return s. */ +inline static char* rstrip(char* s) +{ + char* p = s + strlen(s); + while (p > s && isspace((unsigned char)(*--p))) + *p = '\0'; + return s; +} + +/* Return pointer to first non-whitespace char in given string. */ +inline static char* lskip(const char* s) +{ + while (*s && isspace((unsigned char)(*s))) + s++; + return (char*)s; +} + +/* Return pointer to first char (of chars) or inline comment in given string, + or pointer to null at end of string if neither found. Inline comment must + be prefixed by a whitespace character to register as a comment. */ +inline static char* find_chars_or_comment(const char* s, const char* chars) +{ +#if INI_ALLOW_INLINE_COMMENTS + int was_space = 0; + while (*s && (!chars || !strchr(chars, *s)) && + !(was_space && strchr(INI_INLINE_COMMENT_PREFIXES, *s))) { + was_space = isspace((unsigned char)(*s)); + s++; + } +#else + while (*s && (!chars || !strchr(chars, *s))) { + s++; + } +#endif + return (char*)s; +} + +/* Version of strncpy that ensures dest (size bytes) is null-terminated. */ +inline static char* strncpy0(char* dest, const char* src, size_t size) +{ + strncpy(dest, src, size); + dest[size - 1] = '\0'; + return dest; +} + +/* See documentation in header file. */ +inline int ini_parse_stream(ini_reader reader, void* stream, ini_handler handler, + void* user) +{ + /* Uses a fair bit of stack (use heap instead if you need to) */ +#if INI_USE_STACK + char line[INI_MAX_LINE]; +#else + char* line; +#endif + char section[MAX_SECTION] = ""; + char prev_name[MAX_NAME] = ""; + + char* start; + char* end; + char* name; + char* value; + int lineno = 0; + int error = 0; + +#if !INI_USE_STACK + line = (char*)malloc(INI_MAX_LINE); + if (!line) { + return -2; + } +#endif + + /* Scan through stream line by line */ + while (reader(line, INI_MAX_LINE, stream) != NULL) { + lineno++; + + start = line; +#if INI_ALLOW_BOM + if (lineno == 1 && (unsigned char)start[0] == 0xEF && + (unsigned char)start[1] == 0xBB && + (unsigned char)start[2] == 0xBF) { + start += 3; + } +#endif + start = lskip(rstrip(start)); + + if (*start == ';' || *start == '#') { + /* Per Python configparser, allow both ; and # comments at the + start of a line */ + } +#if INI_ALLOW_MULTILINE + else if (*prev_name && *start && start > line) { + +#if INI_ALLOW_INLINE_COMMENTS + end = find_chars_or_comment(start, NULL); + if (*end) + *end = '\0'; + rstrip(start); +#endif + + /* Non-blank line with leading whitespace, treat as continuation + of previous name's value (as per Python configparser). 
*/ + if (!handler(user, section, prev_name, start) && !error) + error = lineno; + } +#endif + else if (*start == '[') { + /* A "[section]" line */ + end = find_chars_or_comment(start + 1, "]"); + if (*end == ']') { + *end = '\0'; + strncpy0(section, start + 1, sizeof(section)); + *prev_name = '\0'; + } + else if (!error) { + /* No ']' found on section line */ + error = lineno; + } + } + else if (*start) { + /* Not a comment, must be a name[=:]value pair */ + end = find_chars_or_comment(start, "=:"); + if (*end == '=' || *end == ':') { + *end = '\0'; + name = rstrip(start); + value = lskip(end + 1); +#if INI_ALLOW_INLINE_COMMENTS + end = find_chars_or_comment(value, NULL); + if (*end) + *end = '\0'; +#endif + rstrip(value); + + /* Valid name[=:]value pair found, call handler */ + strncpy0(prev_name, name, sizeof(prev_name)); + if (!handler(user, section, name, value) && !error) + error = lineno; + } + else if (!error) { + /* No '=' or ':' found on name[=:]value line */ + error = lineno; + } + } + +#if INI_STOP_ON_FIRST_ERROR + if (error) + break; +#endif + } + +#if !INI_USE_STACK + free(line); +#endif + + return error; +} + +/* See documentation in header file. */ +inline int ini_parse_file(FILE* file, ini_handler handler, void* user) +{ + return ini_parse_stream((ini_reader)fgets, file, handler, user); +} + +/* See documentation in header file. */ +inline int ini_parse(const char* filename, ini_handler handler, void* user) +{ + FILE* file; + int error; + + file = fopen(filename, "r"); + if (!file) + return -1; + error = ini_parse_file(file, handler, user); + fclose(file); + return error; +} + +#endif /* __INI_H__ */ + + +#ifndef __INIREADER_H__ +#define __INIREADER_H__ + +#include +#include +#include + +// Read an INI file into easy-to-access name/value pairs. (Note that I've gone +// for simplicity here rather than speed, but it should be pretty decent.) +class INIReader +{ +public: + // Empty Constructor + INIReader() {}; + + // Construct INIReader and parse given filename. See ini.h for more info + // about the parsing. + INIReader(std::string filename); + + // Construct INIReader and parse given file. See ini.h for more info + // about the parsing. + INIReader(FILE *file); + ~INIReader(); + // Return the result of ini_parse(), i.e., 0 on success, line number of + // first error on parse error, or -1 on file open error. + int ParseError() const; + + // Return the list of sections found in ini file + const std::set& Sections() const; + + // Get a string value from INI file, returning default_value if not found. + std::string Get(std::string section, std::string name, + std::string default_value) const; + std::string Get(std::string section, std::string name) const; + + // Get an integer (long) value from INI file, returning default_value if + // not found or not a valid integer (decimal "1234", "-1234", or hex "0x4d2"). + long GetInteger(std::string section, std::string name, long default_value) const; + long GetInteger(std::string section, std::string name) const; + + // Get a real (floating point double) value from INI file, returning + // default_value if not found or not a valid floating point value + // according to strtod(). + double GetReal(std::string section, std::string name, double default_value) const; + + // Get a single precision floating point number value from INI file, returning + // default_value if not found or not a valid floating point value + // according to strtof(). 
+ float GetFloat(std::string section, std::string name, float default_value) const; + float GetFloat(std::string section, std::string name) const; + + // Get a boolean value from INI file, returning default_value if not found or if + // not a valid true/false value. Valid true values are "true", "yes", "on", "1", + // and valid false values are "false", "no", "off", "0" (not case sensitive). + bool GetBoolean(std::string section, std::string name, bool default_value) const; + +protected: + int _error; + std::map _values; + std::set _sections; + static std::string MakeKey(std::string section, std::string name); + static int ValueHandler(void* user, const char* section, const char* name, + const char* value); +}; + +#endif // __INIREADER_H__ + + +#ifndef __INIREADER__ +#define __INIREADER__ + +#include +#include +#include + +inline INIReader::INIReader(std::string filename) +{ + _error = ini_parse(filename.c_str(), ValueHandler, this); +} + +inline INIReader::INIReader(FILE *file) +{ + _error = ini_parse_file(file, ValueHandler, this); +} + +inline int INIReader::ParseError() const +{ + return _error; +} + +inline INIReader::~INIReader() { } + +inline const std::set& INIReader::Sections() const +{ + return _sections; +} + +inline std::string INIReader::Get(std::string section, std::string name, std::string default_value) const +{ + std::string key = MakeKey(section, name); + return _values.count(key) ? _values.at(key) : default_value; +} + +inline std::string INIReader::Get(std::string section, std::string name) const +{ + std::string key = MakeKey(section, name); + if(_values.count(key)) return _values.at(key); + else + { + printf("[ERROR] Does not find the section %s with name %s. \n", section.c_str(), name.c_str()); + exit(-1); + } +} + +inline long INIReader::GetInteger(std::string section, std::string name, long default_value) const +{ + std::string valstr = Get(section, name, ""); + const char* value = valstr.c_str(); + char* end; + // This parses "1234" (decimal) and also "0x4D2" (hex) + long n = strtol(value, &end, 0); + return end > value ? n : default_value; +} + +inline long INIReader::GetInteger(std::string section, std::string name) const +{ + std::string valstr = Get(section, name, ""); + const char* value = valstr.c_str(); + char* end; + // This parses "1234" (decimal) and also "0x4D2" (hex) + long n = strtol(value, &end, 0); + if(end <= value) + { + printf("[ERROR] Does not find the section %s with name %s. \n", section.c_str(), name.c_str()); + exit(-1); + } + return n; +} + +inline double INIReader::GetReal(std::string section, std::string name, double default_value) const +{ + std::string valstr = Get(section, name, ""); + const char* value = valstr.c_str(); + char* end; + double n = strtod(value, &end); + return end > value ? n : default_value; +} + +inline float INIReader::GetFloat(std::string section, std::string name, float default_value) const +{ + std::string valstr = Get(section, name, ""); + const char* value = valstr.c_str(); + char* end; + float n = strtof(value, &end); + return end > value ? n : default_value; +} + +inline float INIReader::GetFloat(std::string section, std::string name) const +{ + std::string valstr = Get(section, name, ""); + const char* value = valstr.c_str(); + char* end; + float n = strtof(value, &end); + if(end <= value) + { + printf("[ERROR] Does not find the section %s with name %s. 
\n", section.c_str(), name.c_str()); + exit(-1); + } + return n; +} + +inline bool INIReader::GetBoolean(std::string section, std::string name, bool default_value) const +{ + std::string valstr = Get(section, name, ""); + // Convert to lower case to make string comparisons case-insensitive + std::transform(valstr.begin(), valstr.end(), valstr.begin(), ::tolower); + if (valstr == "true" || valstr == "yes" || valstr == "on" || valstr == "1") + return true; + else if (valstr == "false" || valstr == "no" || valstr == "off" || valstr == "0") + return false; + else + return default_value; +} + +inline std::string INIReader::MakeKey(std::string section, std::string name) +{ + std::string key = section + "=" + name; + // Convert to lower case to make section/name lookups case-insensitive + std::transform(key.begin(), key.end(), key.begin(), ::tolower); + return key; +} + +inline int INIReader::ValueHandler(void* user, const char* section, const char* name, + const char* value) +{ + INIReader* reader = (INIReader*)user; + std::string key = MakeKey(section, name); + if (reader->_values[key].size() > 0) + reader->_values[key] += "\n"; + reader->_values[key] += value; + reader->_sections.insert(section); + return 1; +} + +#endif // __INIREADER__ \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100755 index 0000000000000000000000000000000000000000..b81abb3b86a592f47bc072c56e68cb264fce6691 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,399 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cmake_minimum_required(VERSION 3.11 FATAL_ERROR) # for PyTorch extensions, version should be greater than 3.13 +project(FasterTransformer LANGUAGES CXX CUDA) + +find_package(CUDA 10.2 REQUIRED) + +if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11") + add_definitions("-DENABLE_BF16") + message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.0, enable -DENABLE_BF16 flag") +endif() + +if((${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11" AND ${CUDA_VERSION_MINOR} VERSION_GREATER_EQUAL "8") OR (${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "12")) + add_definitions("-DENABLE_FP8") + option(ENABLE_FP8 "ENABLE_FP8" OFF) + if(ENABLE_FP8) + message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater or equal than 11.8, enable -DENABLE_FP8 flag") + endif() +endif() + +set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) + +option(BUILD_PYT "Build in PyTorch TorchScript class mode" OFF) +if(NOT BUILD_MULTI_GPU) + option(BUILD_MULTI_GPU "Build project about multi-GPU" OFF) +endif() +if(NOT USE_TRITONSERVER_DATATYPE) + option(USE_TRITONSERVER_DATATYPE "Build triton backend for triton server" OFF) +endif() + +include(FetchContent) + +FetchContent_Declare( + repo-cutlass + GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git + GIT_TAG cc85b64cf676c45f98a17e3a47c0aafcf817f088 +) + +set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") + +FetchContent_MakeAvailable(repo-cutlass) + +set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include) +set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/fastertransformer/cutlass_extensions/include) + +option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF) + +option(BUILD_FAST_MATH "Build in fast math mode" ON) + +if(BUILD_MULTI_GPU) + message(STATUS "Add DBUILD_MULTI_GPU, requires MPI and NCCL") + add_definitions("-DBUILD_MULTI_GPU") + set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) + find_package(MPI REQUIRED) + find_package(NCCL REQUIRED) + set(CMAKE_MODULE_PATH "") # prevent the bugs for pytorch building +endif() + +if(BUILD_PYT) + if(DEFINED ENV{NVIDIA_PYTORCH_VERSION}) + if($ENV{NVIDIA_PYTORCH_VERSION} VERSION_LESS "20.03") + message(FATAL_ERROR "NVIDIA PyTorch image is too old for TorchScript mode.") + endif() + if($ENV{NVIDIA_PYTORCH_VERSION} VERSION_EQUAL "20.03") + add_definitions(-DLEGACY_THS=1) + endif() + endif() +endif() + +if(USE_TRITONSERVER_DATATYPE) + message("-- USE_TRITONSERVER_DATATYPE") + add_definitions("-DUSE_TRITONSERVER_DATATYPE") +endif() + +set(CXX_STD "14" CACHE STRING "C++ standard") + +set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR}) + +set(TF_PATH "" CACHE STRING "TensorFlow path") +set(CUSPARSELT_PATH "" CACHE STRING "cuSPARSELt path") + +if((BUILD_TF OR BUILD_TF2) AND NOT TF_PATH) + message(FATAL_ERROR "TF_PATH must be set if BUILD_TF or BUILD_TF2 (=TensorFlow mode) is on.") +endif() + +list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64) + +# profiling +option(USE_NVTX "Whether or not to use nvtx" ON) +if(USE_NVTX) + message(STATUS "NVTX is enabled.") + add_definitions("-DUSE_NVTX") +endif() + +# setting compiler flags +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -ldl") # -Xptxas -v + +set(SM_SETS 52 60 61 70 75 80 86 89 90) +set(USING_WMMA False) +set(FIND_SM False) + +foreach(SM_NUM IN LISTS SM_SETS) + string(FIND "${SM}" "${SM_NUM}" SM_POS) + if(SM_POS GREATER -1) + if(FIND_SM STREQUAL False) + set(ENV{TORCH_CUDA_ARCH_LIST} 
"") + endif() + set(FIND_SM True) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM_NUM},code=\\\"sm_${SM_NUM},compute_${SM_NUM}\\\"") + + if (SM_NUM STREQUAL 70 OR SM_NUM STREQUAL 75 OR SM_NUM STREQUAL 80 OR SM_NUM STREQUAL 86 OR SM_NUM STREQUAL 89 OR SM_NUM STREQUAL 90) + set(USING_WMMA True) + endif() + + if(BUILD_PYT) + string(SUBSTRING ${SM_NUM} 0 1 SM_MAJOR) + string(SUBSTRING ${SM_NUM} 1 1 SM_MINOR) + set(ENV{TORCH_CUDA_ARCH_LIST} "$ENV{TORCH_CUDA_ARCH_LIST}\;${SM_MAJOR}.${SM_MINOR}") + endif() + + list(APPEND CMAKE_CUDA_ARCHITECTURES ${SM_NUM}) + message("-- Assign GPU architecture (sm=${SM_NUM})") + endif() +endforeach() + +if(USING_WMMA STREQUAL True) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") + message("-- Use WMMA") +endif() + +if(NOT (FIND_SM STREQUAL True)) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \ + -gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \ + -gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \ + -gencode=arch=compute_80,code=\\\"sm_80,compute_80\\\" \ + -gencode=arch=compute_86,code=\\\"sm_86,compute_86\\\" \ + ") + # -rdc=true") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA") + if(BUILD_PYT) + set(ENV{TORCH_CUDA_ARCH_LIST} "7.0;7.5;8.0;8.6") + endif() + set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86) + message("-- Assign GPU architecture (sm=70,75,80,86)") +endif() + +if(BUILD_PYT) + set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST}) +endif() + +set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0") +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0") +# set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall --ptxas-options=-v --resource-usage") +set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall -DCUDA_PTX_FP8_F2FP_ENABLED") + +set(CMAKE_CXX_STANDARD "${CXX_STD}") +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD} -DCUDA_PTX_FP8_F2FP_ENABLED") + +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") +# set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3 --ptxas-options=--verbose") +set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3 -DCUDA_PTX_FP8_F2FP_ENABLED") +if(BUILD_FAST_MATH) +set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} --use_fast_math") +message("CMAKE_CUDA_FLAGS_RELEASE: ${CMAKE_CUDA_FLAGS_RELEASE}") +endif() + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + +set(COMMON_HEADER_DIRS + ${PROJECT_SOURCE_DIR} + ${CUDA_PATH}/include + ${CUTLASS_HEADER_DIR} +) +message("-- COMMON_HEADER_DIRS: ${COMMON_HEADER_DIRS}") + +set(COMMON_LIB_DIRS + ${CUDA_PATH}/lib64 +) + +if (SPARSITY_SUPPORT) + list(APPEND COMMON_HEADER_DIRS ${CUSPARSELT_PATH}/include) + list(APPEND COMMON_LIB_DIRS ${CUSPARSELT_PATH}/lib64) + add_definitions(-DSPARSITY_ENABLED=1) +endif() + +if(BUILD_TF) + list(APPEND COMMON_HEADER_DIRS ${TF_PATH}/include) + list(APPEND COMMON_LIB_DIRS ${TF_PATH}) + add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) +endif() + +if(BUILD_TF2) + list(APPEND COMMON_HEADER_DIRS 
${TF_PATH}/include) + list(APPEND COMMON_LIB_DIRS ${TF_PATH}) + add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1) +endif() + +set(PYTHON_PATH "python" CACHE STRING "Python path") +if(BUILD_PYT) + execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import torch; print(torch.__version__,end='');" + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE TORCH_VERSION) + if (TORCH_VERSION VERSION_LESS "1.5.0") + message(FATAL_ERROR "PyTorch >= 1.5.0 is needed for TorchScript mode.") + endif() + execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import os; import torch; +print(os.path.dirname(torch.__file__),end='');" + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE TORCH_DIR) + if (NOT _PYTHON_SUCCESS MATCHES 0) + message(FATAL_ERROR "Torch config Error.") + endif() + list(APPEND CMAKE_PREFIX_PATH ${TORCH_DIR}) + find_package(Torch REQUIRED) + execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; from distutils import sysconfig; +print(sysconfig.get_python_inc());" + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE PY_INCLUDE_DIR) + if (NOT _PYTHON_SUCCESS MATCHES 0) + message(FATAL_ERROR "Python config Error.") + endif() + list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR}) + execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import torch; +print(torch._C._GLIBCXX_USE_CXX11_ABI,end='');" + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE USE_CXX11_ABI) + message("-- USE_CXX11_ABI=${USE_CXX11_ABI}") + if (USE_CXX11_ABI) + set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -D_GLIBCXX_USE_CXX11_ABI=1") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_GLIBCXX_USE_CXX11_ABI=1") + set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -D_GLIBCXX_USE_CXX11_ABI=1") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_GLIBCXX_USE_CXX11_ABI=1") + else() + set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -D_GLIBCXX_USE_CXX11_ABI=0") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_GLIBCXX_USE_CXX11_ABI=0") + set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -D_GLIBCXX_USE_CXX11_ABI=0") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_GLIBCXX_USE_CXX11_ABI=0") + endif() +endif() + +if (BUILD_MULTI_GPU) + list(APPEND COMMON_HEADER_DIRS ${MPI_INCLUDE_PATH}) + list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib) +endif() + +if(USE_TRITONSERVER_DATATYPE) + list(APPEND COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR}/../repo-core-src/include) +endif() + +include_directories( + ${COMMON_HEADER_DIRS} +) + +link_directories( + ${COMMON_LIB_DIRS} +) + +# add_subdirectory(3rdparty) +add_subdirectory(src) +add_subdirectory(examples) + +add_subdirectory(tests) + +# # Mesaure the compile time +option(MEASURE_BUILD_TIME "Measure the build time of each module" OFF) +if (MEASURE_BUILD_TIME) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_COMMAND} -E time") + set_property(GLOBAL PROPERTY RULE_LAUNCH_CUSTOM "${CMAKE_COMMAND} -E time") + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_COMMAND} -E time") +endif() + +######################################## + +add_library(transformer-shared SHARED + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ +) + +if (BUILD_MULTI_GPU) +target_link_libraries(transformer-shared PUBLIC + -lmpi + ${NCCL_LIBRARIES} +) +endif() + +if(USE_NVTX) +target_link_libraries(transformer-shared PUBLIC + -lnvToolsExt +) 
+endif() + +set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON) +set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) +set_target_properties(transformer-shared PROPERTIES LINKER_LANGUAGE CXX) +target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcublasLt -lcurand) + +include(GNUInstallDirs) +set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/FasterTransformer) + +include(CMakePackageConfigHelpers) +configure_package_config_file( + ${CMAKE_CURRENT_LIST_DIR}/cmake/FasterTransformerConfig.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerConfig.cmake + INSTALL_DESTINATION ${INSTALL_CONFIGDIR} +) + +install( + FILES + ${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerConfig.cmake + DESTINATION ${INSTALL_CONFIGDIR} +) + +install( + TARGETS + transformer-shared + EXPORT + transformer-shared-targets + LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer + ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer +) + +install( + EXPORT + transformer-shared-targets + FILE + FasterTransformerTargets.cmake + DESTINATION + ${INSTALL_CONFIGDIR} +) + +export( + EXPORT + transformer-shared-targets + FILE + ${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerTargets.cmake + NAMESPACE + TritonCore:: +) + +export(PACKAGE FasterTransformer) diff --git a/cmake/FasterTransformerConfig.cmake.in b/cmake/FasterTransformerConfig.cmake.in new file mode 100644 index 0000000000000000000000000000000000000000..290213c9699e66cefbbe6ebe2b81be2f0c13fedb --- /dev/null +++ b/cmake/FasterTransformerConfig.cmake.in @@ -0,0 +1,39 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
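+# How a downstream CMake project would typically consume this package config
+# (a sketch, assuming the package was installed to a prefix visible on
+# CMAKE_PREFIX_PATH; the target name "my_app" is a placeholder):
+#   find_package(FasterTransformer REQUIRED)
+#   target_link_libraries(my_app PRIVATE ${FASTERTRANSFORMER_LIBRARIES})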
+ +include(CMakeFindDependencyMacro) + +get_filename_component( + FASTERTRANSFORMER_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH +) + +list(APPEND CMAKE_MODULE_PATH ${FASTERTRANSFORMER_CMAKE_DIR}) + +if(NOT TARGET transformer-shared) + include("${FASTERTRANSFORMER_CMAKE_DIR}/FasterTransformerTargets.cmake") +endif() + +set(FASTERTRANSFORMER_LIBRARIES transformer-shared) diff --git a/cmake/Modules/FindCUDNN.cmake b/cmake/Modules/FindCUDNN.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7e7fc0c9391e661e14c5c4d9210abeb04be94dda --- /dev/null +++ b/cmake/Modules/FindCUDNN.cmake @@ -0,0 +1,51 @@ +# taken from https://github.com/pytorch/pytorch/blob/master/cmake/Modules_CUDA_fix/FindCUDNN.cmake +# Find the CUDNN libraries +# +# The following variables are optionally searched for defaults +# CUDNN_ROOT: Base directory where CUDNN is found +# CUDNN_INCLUDE_DIR: Directory where CUDNN header is searched for +# CUDNN_LIBRARY: Directory where CUDNN library is searched for +# CUDNN_STATIC: Are we looking for a static library? (default: no) +# +# The following are set after configuration is done: +# CUDNN_FOUND +# CUDNN_INCLUDE_PATH +# CUDNN_LIBRARY_PATH +# + +include(FindPackageHandleStandardArgs) + +set(CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} CACHE PATH "Folder containing NVIDIA cuDNN") +if (DEFINED $ENV{CUDNN_ROOT_DIR}) + message(WARNING "CUDNN_ROOT_DIR is deprecated. Please set CUDNN_ROOT instead.") +endif() +list(APPEND CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}) + +# Compatible layer for CMake <3.12. CUDNN_ROOT will be accounted in for searching paths and libraries for CMake >=3.12. +list(APPEND CMAKE_PREFIX_PATH ${CUDNN_ROOT}) + +set(CUDNN_INCLUDE_DIR $ENV{CUDNN_INCLUDE_DIR} CACHE PATH "Folder containing NVIDIA cuDNN header files") + +find_path(CUDNN_INCLUDE_PATH cudnn.h + HINTS ${CUDNN_INCLUDE_DIR} + PATH_SUFFIXES cuda/include cuda include) + +option(CUDNN_STATIC "Look for static CUDNN" OFF) +if (CUDNN_STATIC) + set(CUDNN_LIBNAME "libcudnn_static.a") +else() + set(CUDNN_LIBNAME "cudnn") +endif() + +set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY} CACHE PATH "Path to the cudnn library file (e.g., libcudnn.so)") +if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a" AND NOT CUDNN_STATIC) + message(WARNING "CUDNN_LIBRARY points to a static library (${CUDNN_LIBRARY}) but CUDNN_STATIC is OFF.") +endif() + +find_library(CUDNN_LIBRARY_PATH ${CUDNN_LIBNAME} + PATHS ${CUDNN_LIBRARY} + PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64) + +find_package_handle_standard_args(CUDNN DEFAULT_MSG CUDNN_LIBRARY_PATH CUDNN_INCLUDE_PATH) + +mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY) diff --git a/cmake/Modules/FindNCCL.cmake b/cmake/Modules/FindNCCL.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d2f2f8358af1df739f918c25d1a9405e7dd32979 --- /dev/null +++ b/cmake/Modules/FindNCCL.cmake @@ -0,0 +1,165 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. 
+# +# From PyTorch: +# +# Copyright (c) 2016- Facebook, Inc (Adam Paszke) +# Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +# Copyright (c) 2011-2013 NYU (Clement Farabet) +# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +# Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) +# +# From Caffe2: +# +# Copyright (c) 2016-present, Facebook Inc. All rights reserved. +# +# All contributions by Facebook: +# Copyright (c) 2016 Facebook Inc. +# +# All contributions by Google: +# Copyright (c) 2015 Google Inc. +# All rights reserved. +# +# All contributions by Yangqing Jia: +# Copyright (c) 2015 Yangqing Jia +# All rights reserved. +# +# All contributions by Kakao Brain: +# Copyright 2019-2020 Kakao Brain +# +# All contributions from Caffe: +# Copyright(c) 2013, 2014, 2015, the respective contributors +# All rights reserved. +# +# All other contributions: +# Copyright(c) 2015, 2016 the respective contributors +# All rights reserved. +# +# Caffe2 uses a copyright model similar to Caffe: each contributor holds +# copyright over their contributions to Caffe2. The project versioning records +# all such contribution and copyright details. If a contributor wants to further +# mark their specific copyright on a particular contribution, they should +# indicate their copyright solely in the commit message of the change when it is +# committed. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America +# and IDIAP Research Institute nor the names of its contributors may be +# used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
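+# Example of pointing this module at a non-default NCCL installation
+# (a sketch; the directories are placeholders for an actual NCCL install,
+# and NCCL is only searched for when BUILD_MULTI_GPU is ON):
+#   NCCL_INCLUDE_DIR=/opt/nccl/include NCCL_LIB_DIR=/opt/nccl/lib \
+#     cmake -DBUILD_MULTI_GPU=ON ..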
+# +# Find the nccl libraries +# +# The following variables are optionally searched for defaults +# NCCL_ROOT: Base directory where all NCCL components are foundHong Xu, 1 year ago: • Let CMake handle NCCL detection instead of ou… +# NCCL_INCLUDE_DIR: Directory where NCCL header is foundPieter Noordhuis, 3 years ago: • Bump gloo +# NCCL_LIB_DIR: Directory where NCCL library is found +# +# The following are set after configuration is done: +# NCCL_FOUND +# NCCL_INCLUDE_DIRS +# NCCL_LIBRARIES +# +# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks +# install NCCL in the same location as the CUDA toolkit. +# See https://github.com/caffe2/caffe2/issues/1601 + +set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers") +set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries") +set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with") + +if ($ENV{NCCL_ROOT_DIR}) + message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.") +endif() +list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}) +# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12. +list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT}) + +find_path(NCCL_INCLUDE_DIRS + NAMES nccl.h + HINTS ${NCCL_INCLUDE_DIR}) + +if (USE_STATIC_NCCL) + MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.") + SET(NCCL_LIBNAME "nccl_static") + if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified + set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif() +else() + SET(NCCL_LIBNAME "nccl") + if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified + set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif() +endif() + +find_library(NCCL_LIBRARIES + NAMES ${NCCL_LIBNAME} + HINTS ${NCCL_LIB_DIR}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES) + +if(NCCL_FOUND) # obtaining NCCL version and some sanity checks + set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h") + message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...") + set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES}) + list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS}) + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED) + + if (NCCL_VERSION_DEFINED) + set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc") + file(WRITE ${file} " + #include + #include + int main() + { + std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl; + int x; + ncclGetVersion(&x); + return x == NCCL_VERSION_CODE; + } +") + try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file} + RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}" + LINK_LIBRARIES ${NCCL_LIBRARIES}) + if (NOT NCCL_VERSION_MATCHED) + message(FATAL_ERROR "Found NCCL header version and library version do not match! 
\ +(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.") + endif() + message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}") + else() + # message(STATUS "NCCL version < 2.3.5-5") + endif () + set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES}) + + message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})") + mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES) +endif() diff --git a/cmake/TritonFasterTransformerBackendConfig.cmake.in b/cmake/TritonFasterTransformerBackendConfig.cmake.in new file mode 100644 index 0000000000000000000000000000000000000000..61a4a5489a80d9d571cf1cddbee4840f70228e13 --- /dev/null +++ b/cmake/TritonFasterTransformerBackendConfig.cmake.in @@ -0,0 +1,39 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +include(CMakeFindDependencyMacro) + +get_filename_component( + TRITONPYTORCHBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH +) + +list(APPEND CMAKE_MODULE_PATH ${TRITONPYTORCHBACKEND_CMAKE_DIR}) + +if(NOT TARGET TritonPyTorchBackend::triton-pytorch-backend) + include("${TRITONPYTORCHBACKEND_CMAKE_DIR}/TritonPyTorchBackendTargets.cmake") +endif() + +set(TRITONPYTORCHBACKEND_LIBRARIES TritonPyTorchBackend::triton-pytorch-backend) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2fd51ada3dd216e9c2facb69551131a5d90a91d --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,15 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +add_subdirectory(cpp) \ No newline at end of file diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ddbf5e9c21a53a000fcfdd50aa6a9670093e24a7 --- /dev/null +++ b/examples/cpp/CMakeLists.txt @@ -0,0 +1,15 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_subdirectory(llama) \ No newline at end of file diff --git a/examples/cpp/llama/CMakeLists.txt b/examples/cpp/llama/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0f0dae55690aecb1827d651a885433cee5ad4fb --- /dev/null +++ b/examples/cpp/llama/CMakeLists.txt @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +add_executable(llama_triton_example llama_triton_example.cc) +target_link_libraries(llama_triton_example PUBLIC -lcublas -lcublasLt -lcudart + LlamaTritonBackend TransformerTritonBackend mpi_utils nccl_utils + nvtx_utils word_list glog) \ No newline at end of file diff --git a/examples/cpp/llama/generate_gemm_config.py b/examples/cpp/llama/generate_gemm_config.py new file mode 100644 index 0000000000000000000000000000000000000000..e538e6d40f46796ffd008a4cd69111e39b601b4f --- /dev/null +++ b/examples/cpp/llama/generate_gemm_config.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import subprocess +import fire + + +def main(head_num: int = 80, + size_per_head: int = 128, + vocab_size: int = 65632, + inter_size: int = 27392, + tensor_para_size: int = 8, + max_batch_size: int = 64): + for bsz in range(1, max_batch_size + 1): + subprocess.call( + f'bin/gpt_gemm {bsz} 1 1 {head_num} {size_per_head} {inter_size} {vocab_size} 1 {tensor_para_size} {0 if bsz == 1 else 1}', + shell=True) + + +if __name__ == '__main__': + fire.Fire(main) diff --git a/examples/cpp/llama/llama_ckpt_convert.py b/examples/cpp/llama/llama_ckpt_convert.py new file mode 100644 index 0000000000000000000000000000000000000000..a3ae4d1d0060d436be3534176f2d26ff2d8ab2d1 --- /dev/null +++ b/examples/cpp/llama/llama_ckpt_convert.py @@ -0,0 +1,210 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
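+# Example invocations (a sketch; paths are placeholders, and argument parsing
+# is handled by `fire.Fire(main)` at the bottom of this file):
+#   python llama_ckpt_convert.py fb /path/to/llama_ckpt_dir workspace/weights --n_inference 8
+#   python llama_ckpt_convert.py gptq /path/to/hf_model_dir workspace/weights --n_inference 8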
+ +import torch +import fire +import os.path as osp +from os import makedirs +from pathlib import Path +import safetensors +from typing import List +from tqdm import tqdm + + +def import_fb(ckpt_dir: str): + checkpoints = [] + for pattern in ['*.pth', '*.pt']: + checkpoints += sorted(Path(ckpt_dir).glob(pattern)) + print(checkpoints) + n_ckpt = len(checkpoints) + model_params = {} + + def get_param(name, size): + print(name, size) + if name not in model_params: + model_params[name] = torch.zeros( + size, dtype=torch.float16, device='cpu') + return model_params[name] + for i, ckpt_path in enumerate(checkpoints): + ckpt = torch.load(ckpt_path, map_location='cpu') + for param_name, param_data in ckpt.items(): + key = param_name.split('.')[-2] + if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'output']: # column-parallel + size = param_data.size(0) + param = get_param( + param_name, [size * n_ckpt, param_data.size(1)]) + param.data[size * i: size * (i + 1), :] = param_data + elif key in ['w2', 'wo', 'tok_embeddings']: # row-parallel + size = param_data.size(-1) + param = get_param( + param_name, [param_data.size(0), size * n_ckpt]) + param.data[:, size * i: size * (i + 1)] = param_data + elif i == 0: + param = get_param(param_name, param_data.size()) + param.data = param_data + del ckpt + + for name, param in model_params.items(): + # transpose all weights as FasterTransformer is expecting column-major weights + # (output_dims, input_dims) -> (input_dims, output_dims) + key = name.split('.')[-2] + if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']: + param.data = param.data.t() + + # concat qkv projection + for i in range(1000): + _qkv = [f'layers.{i}.attention.{k}.weight' for k in ['wq', 'wk', 'wv']] + try: + qkv = tuple(map(model_params.pop, _qkv)) + except KeyError: + break + qkv = torch.stack(qkv, dim=1) + model_params[f'layers.{i}.attention.w_qkv.weight'] = qkv + print(qkv.shape, qkv.dtype) + + return model_params + + +def permute(x: torch.Tensor): + SIZE_PER_HEAD = 128 + if x.shape[-1] > 1: # qweights + dim = x.shape[-1] + n_heads = dim // SIZE_PER_HEAD + return x.view(-1, n_heads, 2, dim // n_heads // 2).transpose(2, 3).reshape(-1, dim) + else: # scales, zeros + dim = x.shape[0] + n_heads = dim // SIZE_PER_HEAD + return x.view(n_heads, 2, dim // n_heads // 2, 1).transpose(1, 2).reshape(dim, 1) + + +def check_zero(x: torch.Tensor): + sum = x.flatten().sum().item() + assert sum == 0, str(sum) + + +def import_gptq(path: str): + model_params = {} + + _qweight = 'weight' + _suffixes = [_qweight] + n_split = 3 + if True: + _params = {} + for i in tqdm(range(0, n_split)): + filename = "pytorch_model-{:05d}-of-{:05d}.bin".format(i + 1, n_split) + _tmp = torch.load(osp.join(path, filename), map_location='cpu') + _params.update(_tmp) + # print('\n'.join(_params.keys())) + def get_tensor(name): + return _params[name] + def get_tensor_transposed(name): + return _params[name].t() + + # _qweight = 'qweight' + # _suffixes = [_qweight, 'bias', 'scales', 'zeros'] + # with safetensors.safe_open(path, framework='pt') as f: + # get_tensor = f.get_tensor + # # quantized weights are already in column major, no need to transpose + # get_tensor_transposed = get_tensor + for i in range(1000): + try: + # attention weights + _qkvo = [f'model.layers.{i}.self_attn.{t}_proj' for t in 'qkvo'] + for suffix in _suffixes: + q, k, v, o = map(get_tensor_transposed, map(('{}.' 
+ suffix).format, _qkvo)) + if suffix == 'bias': + check_zero(q), check_zero(k), check_zero(v), check_zero(o) + else: + # q, k has different layout for fb & hf, convert to fb's layout + q = permute(q) + k = permute(k) + if suffix == _qweight: # weight, qweight + # insert a dimension for splitting heads later + # qkv = torch.cat([q[:, None, :], k[:, None, :], v[:, None, :]], dim=1) + qkv = torch.stack((q, k, v), dim=1) + else: # scales, zeros + # qkv = torch.cat([q[None, :], k[None, :], v[None, :]], dim=0).squeeze(dim=-1) + qkv = torch.stack((q, k, v), dim=0).squeeze(dim=-1) + for k, v in [('w_qkv', qkv), ('wo', o)]: + model_params[f'layers.{i}.attention.{k}.{suffix}'] = v + # ffn weights + _w123 = [f'model.layers.{i}.mlp.{t}_proj' for t in ['gate', 'down', 'up']] + for suffix in _suffixes: + w1, w2, w3 = map(get_tensor_transposed, map(('{}.' + suffix).format, _w123)) + if suffix == 'bias': + check_zero(w1), check_zero(w2), check_zero(w3) + else: + if suffix in ['scales', 'zeros']: + w1, w2, w3 = map(lambda x: x.squeeze(dim=-1), [w1, w2, w3]) + for k, v in [('w1', w1), ('w2', w2), ('w3', w3)]: + model_params[f'layers.{i}.feed_forward.{k}.{suffix}'] = v + other = [('attention_norm.weight', 'input_layernorm.weight'), + ('ffn_norm.weight', 'post_attention_layernorm.weight')] + for ours, theirs in other: + model_params[f'layers.{i}.' + ours] = get_tensor(f'model.layers.{i}.' + theirs) + except safetensors.SafetensorError: + break + except KeyError: + break + print(i) + + other = [('tok_embeddings.weight', 'model.embed_tokens.weight'), + ('norm.weight', 'model.norm.weight'), + ('output.weight', 'lm_head.weight')] + for ours, theirs in other: + model_params[ours] = get_tensor(theirs) + + return model_params + + +def export(model_params: dict, out_dir: str, n_inference: int): + makedirs(out_dir, exist_ok=True) + + def save_bin(param: torch.Tensor, name): + print(name, param.shape) + if param.dtype in [torch.float, torch.bfloat16]: + param = param.half() + param.contiguous().numpy().tofile(osp.join(out_dir, name)) + + # reverse the spliting axes since the weights are transposed above + for param_name, param_data in model_params.items(): + split_dim = None + key, ext = param_name.split('.')[-2:] + copy = False + if key in ['w1', 'w3', 'w_qkv']: + split_dim = -1 + elif key in ['w2', 'wo']: + if ext in ['scales', 'zeros']: + copy = True + else: + split_dim = 0 + if split_dim is not None: + print(f'*** spliting {param_name}, shape={param_data.shape}, split_dim={split_dim}') + assert param_data.shape[split_dim] % n_inference == 0 + split_size = param_data.shape[split_dim] // n_inference + splits = torch.split(param_data, split_size, dim=split_dim) + for i, split in enumerate(splits): + prefix, ext = osp.splitext(param_name) + save_bin(split, f'{prefix}.{i}{ext}') + elif copy: + print(f'### copying {param_name}, shape={param_data.shape}') + copies = [param_data] * n_inference + for i, copy in enumerate(copies): + prefix, ext = osp.splitext(param_name) + save_bin(copy, f'{prefix}.{i}{ext}') + else: + save_bin(param_data, param_name) + + +def main(kind: str, input_path: str, out_dir: str, n_inference: int = 1): + if kind == 'fb': + model_params = import_fb(input_path) + elif kind == 'gptq': + model_params = import_gptq(input_path) + else: + raise RuntimeError(f'Unsupported kind: {kind}') + + export(model_params, out_dir, n_inference) + + +if __name__ == '__main__': + fire.Fire(main) \ No newline at end of file diff --git a/examples/cpp/llama/llama_config.ini b/examples/cpp/llama/llama_config.ini new file 
mode 100644 index 0000000000000000000000000000000000000000..09c662d8962d1b126b4b8ea12e879d71e0b999eb --- /dev/null +++ b/examples/cpp/llama/llama_config.ini @@ -0,0 +1,82 @@ +[ft_instance_hyperparameter] +data_type=fp16 +enable_custom_all_reduce=0 +pipeline_para_size=1 +tensor_para_size=8 +model_dir=/shared_data/chatpjlm-0/v0.2.3/fastertransformer/weights/ + + +[request] +request_batch_size=8 +request_output_len=2048 +beam_width=1 ; beam width for beam search +top_k=1 ; k value for top k sampling +top_p=0.0 ; p value for top p sampling +temperature=1.0 ; Use for sampling +repetition_penalty=1.00 ; Use for sampling +presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty are allowed. +len_penalty=0.0 +beam_search_diversity_rate=0.0 +; PJLM start/end ids +start_id=0 +end_id=1 + + +; --------------------- legacy params ------------------------- + +; LLaMA start/end ids +; start_id=1 +; end_id=2 + +[4999_llama] +head_num=80 +size_per_head=128 +vocab_size=65632 +num_layer=82 +rotary_embedding=128 +norm_eps=1e-5 +start_id=0 +end_id=1 +inter_size=27392 + +[llama_7B] +head_num=32 +size_per_head=128 +vocab_size=32000 +num_layer=32 +rotary_embedding=128 +start_id=1 +end_id=2 +inter_size=11008 + +[llama_13B] +head_num=40 +size_per_head=128 +vocab_size=32000 +num_layer=40 +rotary_embedding=128 +start_id=1 +end_id=2 +inter_size=13824 + +[llama_30B] +head_num=52 +size_per_head=128 +vocab_size=32000 +num_layer=60 +rotary_embedding=128 +start_id=1 +end_id=2 +inter_size=17920 + +[llama_65B] +head_num=64 +size_per_head=128 +vocab_size=32000 +num_layer=80 +rotary_embedding=128 +start_id=1 +end_id=2 +inter_size=22016 + + diff --git a/examples/cpp/llama/llama_triton_example.cc b/examples/cpp/llama/llama_triton_example.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e698b040177243555cd6f47adbe7f6e19d18582 --- /dev/null +++ b/examples/cpp/llama/llama_triton_example.cc @@ -0,0 +1,584 @@ +/* + * Copyright (c) OpenMMLab. All rights reserved. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/examples/cpp/multi_gpu_gpt/multi_gpu_gpt_triton_example.cc + +#include "3rdparty/INIReader.h" +#include +#include + +#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h" +#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h" +#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp" +#include "src/fastertransformer/utils/custom_ar_comm.h" +#include "src/fastertransformer/utils/mpi_utils.h" +#include "src/fastertransformer/utils/nccl_utils.h" +#include "src/fastertransformer/utils/nvtx_utils.h" +#include "src/fastertransformer/utils/word_list.h" + +namespace ft = fastertransformer; + +constexpr const bool kUSE_MPI = true; + +struct RequestParam { + int beam_width; + int request_output_len; + float beam_search_diversity_rate; + uint runtime_top_k; + float runtime_top_p; + float temperature; + float len_penalty; + float repetition_penalty; + float presence_penalty; + int min_length; + unsigned long long int random_seed; + int start_id; + int end_id; +}; + +std::vector>> +broadCastRequest(const std::vector& v_start_ids, + const std::vector& v_start_lengths, + const std::vector& v_bad_words, + const int node_id, + const int gpu_count, + const RequestParam param, + std::vector* pointer_record) +{ + // broadcast the request to all nodes, and copy "gpu_count" copies on + // different gpu + int size_1 = v_start_ids.size(); + int size_2 = v_start_lengths.size(); + int size_bad_words = v_bad_words.size(); + if (kUSE_MPI) { + ft::mpi::bcast(&size_1, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + ft::mpi::bcast(&size_2, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + ft::mpi::bcast(&size_bad_words, 1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + } + + std::vector v_input_ids(size_1); + std::vector v_input_lengths(size_2); + std::vector v_input_bad_words(size_bad_words); + + if (node_id == 0) { + memcpy(v_input_ids.data(), v_start_ids.data(), size_1 * sizeof(int)); + memcpy(v_input_lengths.data(), v_start_lengths.data(), size_2 * sizeof(int)); + memcpy(v_input_bad_words.data(), v_bad_words.data(), size_bad_words * sizeof(int)); + } + if (kUSE_MPI) { + ft::mpi::barrier(); + } + + int request_batch_size = size_2; + int max_input_len = size_1 / size_2; + + std::cerr << "request_batch_size=" << request_batch_size << " max_input_len=" << max_input_len << "\n"; + + if (kUSE_MPI) { + ft::mpi::bcast(v_input_ids.data(), size_1, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + ft::mpi::bcast(v_input_lengths.data(), size_2, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + ft::mpi::bcast(v_input_bad_words.data(), size_bad_words, ft::mpi::MPI_TYPE_INT, 0, ft::mpi::COMM_WORLD); + } + + std::vector>> request_list; + for (int device_id = 0; device_id < gpu_count; device_id++) { + ft::check_cuda_error(cudaSetDevice(device_id)); + + int* d_input_ids; + // int* d_input_lengths; + int* d_input_bad_words; + + if (max_input_len == 0) { + // unconditional case, no input ids, so do nothing. + d_input_ids = nullptr; + // d_input_lengths = nullptr; + max_input_len = 0; + } + else { + // conditional case. 
+ ft::deviceMalloc(&d_input_ids, size_1, false); + // ft::deviceMalloc(&d_input_lengths, size_2, false); + ft::cudaH2Dcpy(d_input_ids, v_input_ids.data(), size_1); + // ft::cudaH2Dcpy(d_input_lengths, v_input_lengths.data(), size_2); + } + + if (!v_input_bad_words.empty()) { + ft::deviceMalloc(&d_input_bad_words, size_bad_words, false); + ft::cudaH2Dcpy(d_input_bad_words, v_input_bad_words.data(), size_bad_words); + } + else { + d_input_bad_words = nullptr; + } + + uint32_t* request_output_len_ptr = (uint32_t*)malloc(request_batch_size * sizeof(uint32_t)); + int* input_lengths_ptr = (int*)malloc(request_batch_size * sizeof(int)); + for (int i = 0; i < request_batch_size; i++) { + request_output_len_ptr[i] = param.request_output_len; + input_lengths_ptr[i] = v_input_lengths[i]; + } + + int* start_ids_ptr = (int*)malloc(request_batch_size * sizeof(int)); + int* end_ids_ptr = (int*)malloc(request_batch_size * sizeof(int)); + for (int i = 0; i < request_batch_size; i++) { + start_ids_ptr[i] = param.start_id; + end_ids_ptr[i] = param.end_id; + } + pointer_record->push_back(start_ids_ptr); + pointer_record->push_back(end_ids_ptr); + + request_list.push_back(std::shared_ptr>( + new std::unordered_map{ + {"input_ids", + triton::Tensor{triton::MEMORY_GPU, + triton::TYPE_INT32, + std::vector{(size_t)request_batch_size, (size_t)max_input_len}, + d_input_ids}}, + {"input_lengths", + triton::Tensor{triton::MEMORY_CPU, + triton::TYPE_INT32, + std::vector{(size_t)request_batch_size}, + input_lengths_ptr}}, + {"request_output_len", + triton::Tensor{triton::MEMORY_CPU, + triton::TYPE_INT32, + std::vector{(size_t)request_batch_size}, + request_output_len_ptr}}, + {"bad_words_list", + triton::Tensor{ + triton::MEMORY_GPU, triton::TYPE_INT32, {2, v_input_bad_words.size() / 2}, d_input_bad_words}}, + {"start_id", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, start_ids_ptr}}, + {"end_id", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, {(size_t)request_batch_size}, end_ids_ptr}}})); + + int* beam_width_ptr = new int(param.beam_width); + pointer_record->push_back(beam_width_ptr); + request_list[device_id]->insert( + {"beam_width", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector{1}, beam_width_ptr}}); + if (param.beam_width > 1) { + float* beam_search_diversity_rate_ptr = new float(param.beam_search_diversity_rate); + pointer_record->push_back(beam_search_diversity_rate_ptr); + request_list[device_id]->insert( + {"beam_search_diversity_rate", + triton::Tensor{ + triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, beam_search_diversity_rate_ptr}}); + } + else { + if (param.runtime_top_p != 0.0f) { + float* runtime_top_p_ptr = new float(param.runtime_top_p); + pointer_record->push_back(runtime_top_p_ptr); + request_list[device_id]->insert( + {"runtime_top_p", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, runtime_top_p_ptr}}); + } + if (param.runtime_top_k != 0) { + uint* runtime_top_k_ptr = new uint(param.runtime_top_k); + pointer_record->push_back(runtime_top_k_ptr); + request_list[device_id]->insert( + {"runtime_top_k", + triton::Tensor{ + triton::MEMORY_CPU, triton::TYPE_UINT32, std::vector{1}, runtime_top_k_ptr}}); + } + } + float* temperature_ptr = new float(param.temperature); + pointer_record->push_back(temperature_ptr); + request_list[device_id]->insert( + {"temperature", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, temperature_ptr}}); + float* len_penalty_ptr = new 
float(param.len_penalty); + pointer_record->push_back(len_penalty_ptr); + request_list[device_id]->insert( + {"len_penalty", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, len_penalty_ptr}}); + if (param.repetition_penalty != 1.0f) { + float* repetition_penalty_ptr = new float(param.repetition_penalty); + pointer_record->push_back(repetition_penalty_ptr); + request_list[device_id]->insert( + {"repetition_penalty", + triton::Tensor{ + triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, repetition_penalty_ptr}}); + } + if (param.presence_penalty != 0.0f) { + float* presence_penalty_ptr = new float(param.presence_penalty); + pointer_record->push_back(presence_penalty_ptr); + request_list[device_id]->insert( + {"presence_penalty", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_FP32, std::vector{1}, presence_penalty_ptr}}); + } + int* min_length_ptr = new int(param.min_length); + pointer_record->push_back(min_length_ptr); + request_list[device_id]->insert( + {"min_length", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_INT32, std::vector{1}, min_length_ptr}}); + unsigned long long int* random_seed_ptr = new unsigned long long int(param.random_seed); + pointer_record->push_back(random_seed_ptr); + request_list[device_id]->insert( + {"random_seed", + triton::Tensor{triton::MEMORY_CPU, triton::TYPE_UINT64, std::vector{1}, random_seed_ptr}}); + + pointer_record->push_back(d_input_ids); + // pointer_record->push_back(d_input_lengths); + pointer_record->push_back(d_input_bad_words); + pointer_record->push_back(request_output_len_ptr); + pointer_record->push_back(input_lengths_ptr); + } + + return request_list; +} + +int read_start_ids(size_t batch_size, + std::vector* v_start_lengths, + std::vector* v_start_ids, + size_t& max_input_len, + const int end_id, + const int beam_width, + std::string file_name); + +std::vector>> +prepareRequest(std::string ini_name, const int node_id, const int gpu_count, std::vector* pointer_record) +{ + INIReader reader = INIReader(ini_name); + if (reader.ParseError() < 0) { + std::cout << "[ERROR] Can't load '" << ini_name << "'\n"; + ft::FT_CHECK(false); + } + + const size_t request_batch_size = reader.GetInteger("request", "request_batch_size"); + std::cerr << "request_batch_size=" << request_batch_size << "\n"; + + const int start_id = reader.GetInteger("request", "start_id"); + const int end_id = reader.GetInteger("request", "end_id"); + + std::vector v_start_ids; + std::vector v_start_lengths; + + size_t max_input_len = 0; + read_start_ids(request_batch_size, + &v_start_lengths, + &v_start_ids, + max_input_len, + end_id, + 1, + "../examples/cpp/llama/start_ids.csv"); + // drop requests > request_batch_size + if (v_start_lengths.size() > request_batch_size) { + v_start_lengths.resize(request_batch_size); + v_start_ids.resize(request_batch_size * max_input_len); + } + std::cerr << "max_input_len=" << max_input_len << "\n"; + + std::vector v_bad_words; + // ft::read_word_list("../examples/cpp/llama/bad_words.csv", v_bad_words); + + RequestParam param; + param.beam_width = reader.GetInteger("request", "beam_width"); + param.request_output_len = reader.GetInteger("request", "request_output_len"); + param.beam_search_diversity_rate = reader.GetFloat("request", "beam_search_diversity_rate"); + param.runtime_top_k = reader.GetInteger("request", "top_k"); + param.runtime_top_p = reader.GetFloat("request", "top_p"); + param.temperature = reader.GetFloat("request", "temperature"); + param.len_penalty = reader.GetFloat("request", 
"len_penalty"); + param.repetition_penalty = reader.GetFloat("request", "repetition_penalty", 1.0f); + param.presence_penalty = reader.GetFloat("request", "presence_penalty", 0.0f); + param.min_length = reader.GetInteger("request", "min_length", 0); + param.random_seed = (unsigned long long int)0; + param.start_id = start_id; + param.end_id = end_id; + + auto request_list = + broadCastRequest(v_start_ids, v_start_lengths, v_bad_words, node_id, gpu_count, param, pointer_record); + return request_list; +} + +int threadCreateModelInstances(std::shared_ptr model, + std::vector>* model_instances, + const int device_id, + const int rank, + std::pair, std::vector> nccl_params, + std::shared_ptr custom_all_reduce_comm = nullptr) +{ + printf("[INFO] rank = %d \n", rank); + ft::check_cuda_error(cudaSetDevice(device_id)); + cudaStream_t stream; + ft::check_cuda_error(cudaStreamCreate(&stream)); + model->createSharedWeights(device_id, rank); + auto model_instance = model->createModelInstance(device_id, rank, stream, nccl_params, custom_all_reduce_comm); + model_instances->at(device_id) = std::move(model_instance); + printf("model instance %d is created \n", device_id); + ft::print_mem_usage(); + return 0; +} + +int threadForward(std::unique_ptr* model_instance, + std::shared_ptr> request, + std::shared_ptr>* output_tensors, + const int device_id, + ft::AbstractInstanceComm* comm) +{ + ft::check_cuda_error(cudaSetDevice(device_id)); + cudaDeviceSynchronize(); + *output_tensors = (*model_instance)->forward(request, comm); + cudaDeviceSynchronize(); + return 0; +} + +int main(int argc, char* argv[]) +{ + /* + Prepare the nccl ids, node id, device id and world size + by MPI or triton + */ + + int node_id = 0; + int node_num = 1; + + if (kUSE_MPI) { + ft::mpi::initialize(&argc, &argv); + node_id = ft::mpi::getCommWorldRank(); + node_num = ft::mpi::getCommWorldSize(); + } + + printf("node_id=%d node_num=%d\n", node_id, node_num); + + // Note: Only supports that all nodes have same gpu count + const int gpu_count = ft::getDeviceCount(); + const int world_size = node_num * gpu_count; + std::string ini_name = argc >= 2 ? 
std::string(argv[1]) : "../examples/cpp/llama/llama_config.ini"; + + // step 1: Create model + std::shared_ptr model = AbstractTransformerModel::createLlamaModel(ini_name); + int tensor_para_size = model->getTensorParaSize(); + int pipeline_para_size = model->getPipelineParaSize(); + printf( + "world_size=%d tensor_para_size=%d pipeline_para_size=%d\n", world_size, tensor_para_size, pipeline_para_size); + FT_CHECK_WITH_INFO(world_size == (tensor_para_size * pipeline_para_size), + "World Size != Tensor Parallel Size * Pipeline Parallel Size !"); + + std::cout << model->toString(); + + // step 2: Initialize the NCCL + std::pair, std::vector> nccl_comms = model->createNcclParams(node_id); + cudaDeviceSynchronize(); + + // Optional Step: create custom all reduce comm + // std::vector> + // custom_all_reduce_comms; model->createCustomComms(&custom_all_reduce_comms, + // world_size); + + // step 2.1 create instance comm + auto instance_comm = model->createInstanceComm(gpu_count); + + // step 3: Create model instances + std::vector> model_instances((size_t)gpu_count); + std::vector threads; + for (int device_id = 0; device_id < gpu_count; device_id++) { + const int rank = node_id * gpu_count + device_id; + threads.push_back( + std::thread(threadCreateModelInstances, model, &model_instances, device_id, rank, nccl_comms, nullptr)); + // custom_all_reduce_comms[rank])); + } + for (auto& t : threads) { + t.join(); + } + + // step 4: prepare request + std::vector pointer_record; // Used to prevent the pointers are + // release after leaving functions + std::vector>> request_list = + prepareRequest(ini_name, node_id, gpu_count, &pointer_record); + printf("[INFO] request is created \n"); + + // step 5: Forward + std::vector>> output_tensors_lists( + (size_t)gpu_count); + for (int i = 0; i < 1; i++) { + threads.clear(); + for (int device_id = 0; device_id < gpu_count; device_id++) { + threads.push_back(std::thread(threadForward, + &model_instances[device_id], + request_list[device_id], + &output_tensors_lists[device_id], + device_id, + instance_comm.get())); + } + for (auto& t : threads) { + t.join(); + } + } + printf("[INFO] forward is completed. \n"); + + const int* d_output_ids = (const int*)output_tensors_lists[0].get()->at("output_ids").data; + const int* d_seq_lens = (const int*)output_tensors_lists[0].get()->at("sequence_length").data; + const int batch_size = output_tensors_lists[0].get()->at("output_ids").shape[0]; + const int beam_width = output_tensors_lists[0].get()->at("output_ids").shape[1]; + const int seq_len = output_tensors_lists[0].get()->at("output_ids").shape[2]; + // step 6: check results + if (node_id == 0) { + std::string fName = "out"; + auto outFile = std::ofstream(fName, std::ios::out); + if (!outFile.is_open()) { + printf("[WARNING] Cannot write results into output file %s \n", fName.c_str()); + } + else { + size_t outCount = batch_size * beam_width * seq_len; + // int* hBuf = new int[outCount]; + std::vector hBuf(outCount); + ft::cudaD2Hcpy(hBuf.data(), d_output_ids, outCount); + std::vector seq_lens(batch_size); + ft::cudaD2Hcpy(seq_lens.data(), d_seq_lens, batch_size); + std::cout << "sequence length: "; + for (int i = 0; i < batch_size; ++i) { + std::cout << (i ? 
", " : "") << seq_lens[i]; + } + std::cout << "\n"; + { + std::cout << "Writing " << outCount << " elements\n"; + int zeroCount = 0; + for (size_t i = 0; i < outCount; i++) { + if (hBuf[i] == int(0)) + zeroCount++; + outFile << hBuf[i] << " "; + if ((i + 1) % (seq_len) == 0) + outFile << std::endl; + + if (i < 10) + printf("%5d ", hBuf[i]); + if ((i + 1) % (seq_len) == 0 && i < 10) + std::cout << std::endl; + } + std::cout << std::endl << "zeroCount = " << zeroCount << std::endl; + } + } + } + + if (kUSE_MPI) { + ft::mpi::barrier(); + } + cudaDeviceSynchronize(); + + if (1) { + // test time + struct timeval start, end; + gettimeofday(&start, NULL); + + const int ite = 1; + for (int i = 0; i < ite; i++) { + threads.clear(); + for (int device_id = 0; device_id < gpu_count; device_id++) { + threads.push_back(std::thread(threadForward, + &model_instances[device_id], + request_list[device_id], + &output_tensors_lists[device_id], + device_id, + instance_comm.get())); + } + for (auto& t : threads) { + t.join(); + } + } + + cudaDeviceSynchronize(); + if (kUSE_MPI) { + ft::mpi::barrier(); + } + + gettimeofday(&end, NULL); + + printf("[INFO] batch_size %d beam_width %d seq_len %d" + " FT-CPP-GPT-Triton-time %.2f ms\n", + batch_size, + beam_width, + seq_len, + ((end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001) / ite); + } + + if (kUSE_MPI) { + ft::mpi::finalize(); + } + return 0; +} + +int read_start_ids(size_t batch_size, + std::vector* v_start_lengths, + std::vector* v_start_ids, + size_t& max_input_len, + const int end_id, + const int beam_width, + std::string file_name) +{ + std::vector> tmp_start_ids; + std::vector tmp_start_lengths; + + std::ifstream start_id_file(file_name, std::ios::in); + int line_num = 0; + if (start_id_file.is_open()) { + std::string line; + while (std::getline(start_id_file, line)) { + std::stringstream lineStream(line); + std::string vals; + int i1 = 0; + std::vector tmp_vec; + while (std::getline(lineStream, vals, ',')) { + tmp_vec.push_back(std::stoi(vals)); + i1++; + } + tmp_start_ids.push_back(tmp_vec); + tmp_start_lengths.push_back(i1); + line_num++; + } + if (batch_size == 0) { + batch_size = line_num; + } + } + else { + printf("[WARNING] Cannot open the file '%s'. \n", file_name.c_str()); + max_input_len = 0; + return 0; + } + + max_input_len = tmp_start_lengths.data()[0]; + for (uint i = 1; i < (uint)tmp_start_lengths.size(); i++) { + max_input_len = max_input_len > tmp_start_lengths.data()[i] ? 
max_input_len : tmp_start_lengths.data()[i]; + } + + while ((int)tmp_start_lengths.size() < batch_size) { + std::vector padding_ids; + for (int i = 0; i < max_input_len; i++) { + padding_ids.push_back(end_id); + } + tmp_start_ids.push_back(padding_ids); + tmp_start_lengths.push_back(max_input_len); + } + + // Add padding + for (int i = 0; i < (int)tmp_start_ids.size(); i++) { + for (int j = (int)tmp_start_ids[i].size(); j < max_input_len; j++) { + tmp_start_ids[i].push_back(end_id); + } + } + + for (int i = 0; i < (int)tmp_start_ids.size(); i++) { + for (int b = 0; b < beam_width; b++) { + for (int j = 0; j < (int)tmp_start_ids[i].size(); j++) { + v_start_ids->push_back(tmp_start_ids[i][j]); + } + v_start_lengths->push_back(tmp_start_lengths[i]); + } + } + return batch_size; +} diff --git a/examples/cpp/llama/start_ids.csv b/examples/cpp/llama/start_ids.csv new file mode 100644 index 0000000000000000000000000000000000000000..1c5d7b09658b59fedc63bceb2a922a4a15663582 --- /dev/null +++ b/examples/cpp/llama/start_ids.csv @@ -0,0 +1,8 @@ +0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,44883,2282,32901,4220,46323,13,44975,45004,11130,32843,45004,35597 +0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,46088,46064,625,19880,46323,13,44975,45004,11130,32843,45004,35597 +0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,47335,56437,60468,46323,13,44975,45004,11130,32843,45004,35597 +0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,44883,2282,6828,3467,46323,13,44975,45004,11130,32843,45004,35597 
+0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,36589,3467,7849,299,7032,46323,13,44975,45004,11130,32843,45004,35597 +0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,44976,39798,6828,3467,46323,13,44975,45004,11130,32843,45004,35597 +0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,2795,977,9193,299,405,537,46323,13,44975,45004,11130,32843,45004,35597 +0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,45691,45926,45513,46641,47641,46285,6456,46323,13,44975,45004,11130,32843,45004,35597 \ No newline at end of file diff --git a/examples/cpp/llama/tokenizer.py b/examples/cpp/llama/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..9187a95dfe5ece5847df1d184c5c45ade78c7871 --- /dev/null +++ b/examples/cpp/llama/tokenizer.py @@ -0,0 +1,57 @@ +from sentencepiece import SentencePieceProcessor +from typing import List +import fire +import sys + + +class Tokenizer: + def __init__(self, model_file: str): + self.model = SentencePieceProcessor(model_file=model_file) + self.vocab_size = self.model.vocab_size() + self.start_id = self.model.bos_id() + self.end_id = self.model.eos_id() + self.pad_id = self.model.pad_id() + print(f'vocab_size = {self.vocab_size}') + print(f'start_id = {self.start_id}') + print(f'end_id = {self.end_id}') + print(f'pad_id = {self.pad_id}') + + def encode(self, s: str): + return self.model.Encode(s, add_bos=True) + + def decode(self, t: List[int]): + return self.model.Decode(t) + + +def main(model_file: str = '/data/llama/model/tokenizer.model', + 
encode_file: str = None, decode_file: str = None): + tokenizer = Tokenizer(model_file) + if encode_file: + with open(encode_file, 'r') as f: + xs = tokenizer.encode(f.read()) + print(','.join(map(str, xs))) + elif decode_file: + with open(decode_file, 'r') as f: + ys = tokenizer.decode(f.read()) + print(ys) + else: + first = True + while True: + try: + s = input() + except EOFError: + break + if not first: + print('---------------------------------------------') + first = False + try: + xs = map(int, s.strip().split(' ')) + s = tokenizer.decode(list(xs)) + print(s) + except ValueError: + xs = tokenizer.encode(s) + print(' '.join(map(str, xs))) + + +if __name__ == '__main__': + fire.Fire(main) \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5474bbe37140e6f73fc15260b93a47f541ffcb1b --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,15 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_subdirectory(fastertransformer) \ No newline at end of file diff --git a/src/fastertransformer/CMakeLists.txt b/src/fastertransformer/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9baa5329a0320efc291b7a5939cc401c95f4845e --- /dev/null +++ b/src/fastertransformer/CMakeLists.txt @@ -0,0 +1,22 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_subdirectory(utils) +add_subdirectory(kernels) +add_subdirectory(layers) +add_subdirectory(models) +if(BUILD_PYT) + add_subdirectory(th_op) +endif() +add_subdirectory(triton_backend) \ No newline at end of file diff --git a/src/fastertransformer/kernels/CMakeLists.txt b/src/fastertransformer/kernels/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d718b1fdf17bf10e154e29a4aaadfa0f46860874 --- /dev/null +++ b/src/fastertransformer/kernels/CMakeLists.txt @@ -0,0 +1,89 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
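The tokenizer.py helper above is what produces rows like those in start_ids.csv: fire exposes the keyword arguments of main() as command-line flags, so a prompt file can be encoded with something along the lines of "python3 tokenizer.py --model_file /path/to/tokenizer.model --encode_file prompt.txt" (paths hypothetical), which prints the comma-separated ids, and a whitespace-separated id sequence pasted into the interactive mode is decoded back to text.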
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 3.8) + +add_library(ban_bad_words STATIC ban_bad_words.cu) +set_property(TARGET ban_bad_words PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET ban_bad_words PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +add_library(stop_criteria STATIC stop_criteria_kernels.cu) +set_property(TARGET stop_criteria PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET stop_criteria PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +add_library(activation_kernels STATIC activation_kernels.cu) +set_property(TARGET activation_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET activation_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +add_library(gen_relative_pos_bias STATIC gen_relative_pos_bias.cu) +set_property(TARGET gen_relative_pos_bias PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET gen_relative_pos_bias PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(gen_relative_pos_bias PUBLIC activation_kernels) + +add_library(logprob_kernels STATIC logprob_kernels.cu) +set_property(TARGET logprob_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET logprob_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +add_library(unfused_attention_kernels STATIC unfused_attention_kernels.cu) +set_property(TARGET unfused_attention_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET unfused_attention_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +add_library(bert_preprocess_kernels STATIC bert_preprocess_kernels.cu) +set_property(TARGET bert_preprocess_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET bert_preprocess_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +set(decoder_masked_multihead_attention_files + decoder_masked_multihead_attention.cu +) +file(GLOB decoder_masked_multihead_attention_files ${decoder_masked_multihead_attention_files} ./decoder_masked_multihead_attention/*.cu) +add_library(decoder_masked_multihead_attention STATIC ${decoder_masked_multihead_attention_files}) +set_property(TARGET decoder_masked_multihead_attention PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET decoder_masked_multihead_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +add_library(online_softmax_beamsearch_kernels STATIC online_softmax_beamsearch_kernels.cu) +set_property(TARGET online_softmax_beamsearch_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET online_softmax_beamsearch_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +add_library(decoding_kernels STATIC decoding_kernels.cu) +set_property(TARGET decoding_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET decoding_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +add_library(gpt_kernels STATIC gpt_kernels.cu) +set_property(TARGET gpt_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET gpt_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +add_library(beam_search_penalty_kernels STATIC beam_search_penalty_kernels.cu) +set_property(TARGET beam_search_penalty_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET beam_search_penalty_kernels 
PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) +target_link_libraries(beam_search_penalty_kernels PRIVATE cuda_utils) + +add_library(beam_search_topk_kernels STATIC beam_search_topk_kernels.cu) +set_property(TARGET beam_search_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET beam_search_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +add_library(sampling_topk_kernels STATIC sampling_topk_kernels.cu) +set_property(TARGET sampling_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET sampling_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +add_library(sampling_topp_kernels STATIC sampling_topp_kernels.cu) +set_property(TARGET sampling_topp_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET sampling_topp_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +add_library(sampling_penalty_kernels STATIC sampling_penalty_kernels.cu) +set_property(TARGET sampling_penalty_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET sampling_penalty_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +add_library(custom_ar_kernels STATIC custom_ar_kernels.cu) +set_property(TARGET custom_ar_kernels PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET custom_ar_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/src/fastertransformer/kernels/activation_kernels.cu b/src/fastertransformer/kernels/activation_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..aa1cd7b10d4d9b8fdd053876ecf2027e8b4b6651 --- /dev/null +++ b/src/fastertransformer/kernels/activation_kernels.cu @@ -0,0 +1,658 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/kernels/activation_kernels.h" +#include "src/fastertransformer/utils/cuda_type_utils.cuh" +#include "src/fastertransformer/utils/cuda_utils.h" +#include "src/fastertransformer/utils/memory_utils.h" + +#ifndef CUDART_VERSION +#error CUDART_VERSION Undefined! 
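+// The activation functors below operate elementwise. GeluActivation uses the tanh
+// approximation gelu(x) ~= 0.5 * x * (1 + tanh(0.7978845608 * (x + 0.044715 * x^3)))
+// (tanh_opt lowers to the tanh.approx PTX instruction on sm_75+ with CUDA 11+), and
+// SiluActivation computes x * sigmoid(x). Host-side reference versions, given here only
+// as a sketch for sanity-checking the kernels, could look like:
+//   float gelu_ref(float x) { return 0.5f * x * (1.f + tanhf(0.7978845608f * (x + 0.044715f * x * x * x))); }
+//   float silu_ref(float x) { return x / (1.f + expf(-x)); }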
+#endif + +namespace fastertransformer { + +/* Gelu Activation */ + +__forceinline__ __device__ float copysignf_pos(float a, float b) +{ + float r; + r = __int_as_float(__float_as_int(a) | (__float_as_int(b) & 0x80000000)); + return r; +} + +__inline__ __device__ float tanh_opt(float x) +{ +#if (__CUDA_ARCH__ >= 750 && CUDART_VERSION >= 11000) + float r; + asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(r) : "f"(x)); + return r; +#else + const float exp_val = -1.f * fabs(2 * x); + return copysignf_pos((1.0f - __expf(exp_val)) / (__expf(exp_val) + 1.0f), x); +#endif +} + +template +struct GeluActivation { + using return_type = T; + static __device__ __forceinline__ T apply(const T& val) + { + const float cdf = 0.5f * (1.0f + tanh_opt((0.7978845608028654f * (val + 0.044715f * val * val * val)))); + return val * cdf; + } +}; + +template<> +struct GeluActivation { + using return_type = half2; + static __device__ __forceinline__ half2 apply(const half2& val) + { + half2 val_pow3 = __hmul2(val, __hmul2(val, val)); + float2 tmp_pow = __half22float2(val_pow3); + float2 tmp = __half22float2(val); + + tmp.x = 0.5f * (1.0f + tanh_opt((0.7978845608028654f * (tmp.x + 0.044715f * tmp_pow.x)))); + tmp.y = 0.5f * (1.0f + tanh_opt((0.7978845608028654f * (tmp.y + 0.044715f * tmp_pow.y)))); + return __hmul2(val, __float22half2_rn(tmp)); + } +}; + +#ifdef ENABLE_BF16 +template<> +struct GeluActivation<__nv_bfloat162> { + using return_type = __nv_bfloat162; + static __device__ __forceinline__ __nv_bfloat162 apply(const __nv_bfloat162& val) + { + __nv_bfloat162 val_pow3 = bf16hmul2(val, bf16hmul2(val, val)); + float2 tmp_pow = bf1622float2(val_pow3); + float2 tmp = bf1622float2(val); + + tmp.x = 0.5f * (1.0f + tanh_opt((0.7978845608028654f * (tmp.x + 0.044715f * tmp_pow.x)))); + tmp.y = 0.5f * (1.0f + tanh_opt((0.7978845608028654f * (tmp.y + 0.044715f * tmp_pow.y)))); + return bf16hmul2(val, __floats2bfloat162_rn(tmp.x, tmp.y)); + } +}; +#endif + +/* Relu Activation */ + +template +struct ReluActivation { + using return_type = T; + static __device__ __forceinline__ T apply(const T& val) + { + return val > static_cast(0.0f) ? val : static_cast(0.0f); + } +}; + +template<> +struct ReluActivation { + using return_type = half2; + static __device__ __forceinline__ half2 apply(const half2& val) + { + const half zero_half = static_cast(0.0f); + return make_half2(val.x > zero_half ? val.x : zero_half, val.y > zero_half ? val.y : zero_half); + } +}; + +#ifdef ENABLE_BF16 +template<> +struct ReluActivation<__nv_bfloat162> { + using return_type = __nv_bfloat162; + static __device__ __forceinline__ __nv_bfloat162 apply(const __nv_bfloat162& val) + { + const __nv_bfloat16 zero_bf16 = static_cast<__nv_bfloat16>(0.0f); + return make_bfloat162(val.x > zero_bf16 ? val.x : zero_bf16, val.y > zero_bf16 ? 
val.y : zero_bf16); + } +}; +#endif + +/* Silu Activation */ + +template +struct SiluActivation { + using return_type = T; + static __device__ __forceinline__ T apply(const T& val) + { + return (T)((float)val / (1.0f + __expf((float)-val))); + } +}; + +template<> +struct SiluActivation { + using return_type = float2; + static __device__ __forceinline__ float2 apply(const half2& val) + { + return make_float2(SiluActivation::apply(val.x), SiluActivation::apply(val.y)); + } +}; + +#ifdef ENABLE_BF16 +template<> +struct SiluActivation<__nv_bfloat162> { + using return_type = float2; + static __device__ __forceinline__ float2 apply(const __nv_bfloat162& val) + { + return make_float2(SiluActivation::apply(val.x), SiluActivation::apply(val.y)); + } +}; +#endif // ENABLE_BF16 + +/* Identity Activation (= no activation) */ + +template +struct IdentityActivation { + using return_type = T; + static __device__ __forceinline__ T apply(const T& val) + { + return val; + } +}; + +// clang-format off +template class Activation, typename T, typename BT> +__global__ void generic_activation(T* out, + const BT* __restrict bias, + const T* __restrict gated_weights, + const BT* __restrict gated_bias, + const int* __restrict ia3_tasks, + const T* __restrict ia3_weights, + const int int8_mode, + const float* __restrict activation_in, + const float* __restrict activation_out, + const int* __restrict padding_offset, + const int seq_len, + int m, + int n) +{ + constexpr size_t packed_elems = num_elems::value; + + const bool with_bias = bias != nullptr; + const bool with_gate = gated_weights != nullptr; + // const bool with_ia3 = ia3_tasks != nullptr; + + using Act_T = typename Activation::return_type; + using Float_T = typename packed_as::type; + using Packed_Int8_t = typename packed_as::type; + + for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) { + T val; + if (int8_mode == 2) { + // val = cuda_cast(cuda_cast(reinterpret_cast(out)[id]) * activation_in[0]); + } + else { + val = out[id]; + } + + T gated_val; + if (with_gate) { + gated_val = gated_weights[id]; + } + + // if (with_bias) { + // const T reg_bias = static_cast(bias[id % n]); + // val = val + reg_bias; + + // if (with_gate) { + // const T reg_gated_bias = static_cast(gated_bias[id % n]); + // gated_val = gated_val + reg_gated_bias; + // } + // } + + if (with_gate) { + val = cuda_cast(Activation::apply(val) * cuda_cast(gated_val)); + } + else { + // val = cuda_cast(Activation::apply(val)); + } + + // if (with_ia3) { + // const int word_id = id / n; + // const int offset = padding_offset == nullptr ? 
0 : padding_offset[word_id]; + // const int batch_id = (word_id + offset) / seq_len; + // const int task = ia3_tasks[batch_id]; + // val = val * ia3_weights[task * n + (id % n)]; + // } + + if (int8_mode != 2) { + out[id] = val; + } + else { + // reinterpret_cast(out)[id] = + // cuda_cast(cuda_cast(val) * activation_out[0]); + } + } +} +// clang-format on + +template class Activation, typename T, typename BT> +void invokeGenericActivation(T* out, + const BT* bias, + const T* gated_weights, + const BT* gated_bias, + const int* ia3_tasks, + const T* ia3_weights, + const int m, + const int n, + const int int8_mode, + const float* activation_in, + const float* activation_out, + const int* padding_offset, + const int seq_len, + cudaStream_t stream) +{ + FT_LOG_DEBUG(__PRETTY_FUNCTION__); + FT_LOG_DEBUG("invokeGenericActivation %d %d %d", m, n, seq_len); + using PT = typename packed_type::type; + constexpr int packed_elems = num_elems::value; + using PBT = typename packed_as::type; + + const int n_threads = 512; + + dim3 block, grid; + if (n / 4 / packed_elems <= n_threads) { + block.x = n / 4 / packed_elems; + grid.x = m; + } + else { + block.x = n_threads; + grid.x = ceil(m * n / double(n_threads)); + } + FT_LOG_DEBUG("%d %d", grid.x, block.x); + sync_check_cuda_error(); + generic_activation<<>>(reinterpret_cast(out), + reinterpret_cast(bias), + reinterpret_cast(gated_weights), + reinterpret_cast(gated_bias), + ia3_tasks, + reinterpret_cast(ia3_weights), + int8_mode, + activation_in, + activation_out, + padding_offset, + seq_len, + m, + n / packed_elems); + sync_check_cuda_error(); +} + +#define INSTANTIATE_GENERIC_ACTIVATION(Activation, T, BT) \ + template void invokeGenericActivation(T * out, \ + const BT* bias, \ + const T* gated_weights, \ + const BT* gated_bias, \ + const int* ia3_tasks, \ + const T* ia3_weights, \ + const int m, \ + const int n, \ + const int int8_mode, \ + const float* activation_in, \ + const float* activation_out, \ + const int* padding_offset, \ + const int seq_len, \ + cudaStream_t stream); + +INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, float, float); +INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, half, half); +#ifdef ENABLE_BF16 +INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, __nv_bfloat16, __nv_bfloat16); +#endif + +INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, float, float); +INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, half, half); +#ifdef ENABLE_BF16 +INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, __nv_bfloat16, __nv_bfloat16); +#endif + +INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, float, float); +INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, half, half); +#ifdef ENABLE_BF16 +INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, __nv_bfloat16, __nv_bfloat16); +#endif + +INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, float, float); +INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, half, half); +INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, float, half); +#ifdef ENABLE_BF16 +INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, __nv_bfloat16, __nv_bfloat16); +INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, float, __nv_bfloat16); +#endif +#undef INSTANCIATE_GENERIC_ACTIVATION + +template +__global__ void add_bias_tanh(T* out, const T* __restrict bias, int m, int n) +{ + for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) { + T val = out[id]; + if (bias != nullptr) { + val = val + ldg(&bias[id % n]); + } + out[id] = tanhf(val); + } +} + +template<> +__global__ void add_bias_tanh(half* out, const half* 
__restrict bias, int m, int n) +{ + half2* out_ptr = (half2*)out; + const half2* bias_ptr = (half2*)bias; + + for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) { + half2 val = out_ptr[id]; + if (bias != nullptr) { + val = val + __ldg(&bias_ptr[id % n]); + } + val.x = tanhf(val.x); + val.y = tanhf(val.y); + out_ptr[id] = val; + } +} + +#ifdef ENABLE_BF16 +template<> +__global__ void add_bias_tanh(__nv_bfloat16* out, const __nv_bfloat16* __restrict bias, int m, int n) +{ + __nv_bfloat162* out_ptr = (__nv_bfloat162*)out; + const __nv_bfloat162* bias_ptr = (__nv_bfloat162*)bias; + + for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) { + __nv_bfloat162 val = out_ptr[id]; + if (bias != nullptr) { + val = bf16hadd2(val, ldg(&bias_ptr[id % n])); + } + val.x = tanhf(val.x); + val.y = tanhf(val.y); + out_ptr[id] = val; + } +} +#endif + +template +void invokeAddBiasTanh(T* out, const T* bias, const int m, const int n, cudaStream_t stream) +{ + const int data_type_factor = 4 / sizeof(T); // 1 for fp32, 2 for fp16 and bf16 + dim3 block, grid; + if (n / 4 / data_type_factor <= 1024) { + block.x = n / 4 / data_type_factor; + grid.x = m; + } + else { + block.x = 1024; + grid.x = ceil(m * n / 1024.); + } + add_bias_tanh<<>>(out, bias, m, n / data_type_factor); +} + +template void invokeAddBiasTanh(float* out, const float* bias, const int m, const int n, cudaStream_t stream); +template void invokeAddBiasTanh(half* out, const half* bias, const int m, const int n, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void +invokeAddBiasTanh(__nv_bfloat16* out, const __nv_bfloat16* bias, const int m, const int n, cudaStream_t stream); +#endif + +template +__global__ void addBiasGeluV2(T2* out, + const T2* __restrict bias, + const int* ia3_tasks, + const T2* ia3_weights, + const int size, + const int* padding_offset, + const int seq_len) +{ + const bool with_ia3 = ia3_tasks != nullptr; + for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < size; id += blockDim.x * gridDim.x) { + T2 val = out[id]; + if (bias != nullptr) { + T2 reg_bias = ldg(&bias[id % N]); + val = hadd2(val, reg_bias); + } + val = GeluActivation::apply(val); + if (with_ia3) { + const int word_id = id / N; + const int offset = padding_offset == nullptr ? 0 : padding_offset[word_id]; + const int batch_id = (word_id + offset) / seq_len; + const int task = ia3_tasks[batch_id]; + val = val * ia3_weights[task * N + (id % N)]; + } + out[id] = val; + } +} + +template +__global__ void addBiasGeluV3(T2* out, + const T2* __restrict bias, + const int* ia3_tasks, + const T2* ia3_weights, + const int size, + const int* padding_offset, + const int seq_len) +{ + const bool with_ia3 = ia3_tasks != nullptr; + T2 buffer[ELEMENT_PER_ROUND]; + T2 tmp_bias[ELEMENT_PER_ROUND]; + for (int id = blockIdx.x * blockDim.x * ELEMENT_PER_ROUND + threadIdx.x * ELEMENT_PER_ROUND; id < size; + id += blockDim.x * gridDim.x * ELEMENT_PER_ROUND) { +#pragma unroll + for (int i = 0; i < ELEMENT_PER_ROUND; i++) { + buffer[i] = out[id + i]; + if (bias != nullptr) { + tmp_bias[i] = ldg(&bias[(id + i) % N]); + } + } +#pragma unroll + for (int i = 0; i < ELEMENT_PER_ROUND; i++) { + if (bias != nullptr) { + buffer[i] = hadd2(buffer[i], tmp_bias[i]); + } + buffer[i] = GeluActivation::apply(buffer[i]); + if (with_ia3) { + const int word_id = (id + i) / N; + const int offset = padding_offset == nullptr ? 
0 : padding_offset[word_id]; + const int batch_id = (word_id + offset) / seq_len; + const int task = ia3_tasks[batch_id]; + buffer[i] = buffer[i] * ia3_weights[task * N + ((id + i) % N)]; + } + out[id + i] = buffer[i]; + } + } +} + +#define ADD_BIAS_GELU(HALF_N, ELEMENT_PER_ROUND) \ + case HALF_N: \ + if (ELEMENT_PER_ROUND > 1) { \ + grid.x = grid.x / ELEMENT_PER_ROUND; \ + addBiasGeluV3<<>>( \ + (T2*)out, (const T2*)bias, ia3_tasks, (T2*)ia3_weights, m * half_n, padding_offset, seq_len); \ + } \ + else { \ + addBiasGeluV2<<>>( \ + (T2*)out, (const T2*)bias, ia3_tasks, (T2*)ia3_weights, m * half_n, padding_offset, seq_len); \ + } \ + break; + +template +void invokeAddBiasGeluV2(T* out, + const T* bias, + const int* ia3_tasks, + const T* ia3_weights, + const int* padding_offset, + const int seq_len, + const int m, + const int n, + cudaStream_t stream) +{ + if (n % 2 == 0 && sizeof(T) == 2) { + const int half_n = n / 2; + dim3 block, grid; + block.x = std::min(half_n, 512); + grid.x = (m * half_n + (block.x - 1)) / block.x; + using T2 = typename TypeConverter::Type; + + if (grid.x >= 512) { + switch (half_n) { + ADD_BIAS_GELU(256, 1) + ADD_BIAS_GELU(512, 1) + ADD_BIAS_GELU(1024, 1) + ADD_BIAS_GELU(1536, 1) + ADD_BIAS_GELU(2048, 1) + ADD_BIAS_GELU(4096, 2) + ADD_BIAS_GELU(8192, 2) + ADD_BIAS_GELU(16384, 2) + ADD_BIAS_GELU(24576, 2) + ADD_BIAS_GELU(40960, 4) + default: + invokeGenericActivation(out, + bias, + (T*)nullptr, + (T*)nullptr, + ia3_tasks, + ia3_weights, + m, + n, + 0, + (float*)nullptr, + (float*)nullptr, + padding_offset, + seq_len, + stream); + break; + } + } + else { + switch (half_n) { + ADD_BIAS_GELU(256, 1) + ADD_BIAS_GELU(512, 1) + ADD_BIAS_GELU(1024, 1) + ADD_BIAS_GELU(1536, 1) + ADD_BIAS_GELU(2048, 1) + ADD_BIAS_GELU(4096, 1) + ADD_BIAS_GELU(8192, 2) + ADD_BIAS_GELU(16384, 2) + ADD_BIAS_GELU(24576, 2) + ADD_BIAS_GELU(40960, 2) + default: + invokeGenericActivation(out, + bias, + (T*)nullptr, + (T*)nullptr, + ia3_tasks, + ia3_weights, + m, + n, + 0, + (float*)nullptr, + (float*)nullptr, + padding_offset, + seq_len, + stream); + break; + } + } + } + else { + invokeGenericActivation(out, + bias, + (T*)nullptr, + (T*)nullptr, + ia3_tasks, + ia3_weights, + m, + n, + 0, + (float*)nullptr, + (float*)nullptr, + padding_offset, + seq_len, + stream); + } +} + +#undef ADD_BIAS_GELU + +template void invokeAddBiasGeluV2(float* out, + const float* bias, + const int* ia3_tasks, + const float* ia3_weights, + const int* padding_offset, + const int seq_len, + const int m, + const int n, + cudaStream_t stream); +template void invokeAddBiasGeluV2(half* out, + const half* bias, + const int* ia3_tasks, + const half* ia3_weights, + const int* padding_offset, + const int seq_len, + const int m, + const int n, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeAddBiasGeluV2(__nv_bfloat16* out, + const __nv_bfloat16* bias, + const int* ia3_tasks, + const __nv_bfloat16* ia3_weights, + const int* padding_offset, + const int seq_len, + const int m, + const int n, + cudaStream_t stream); +#endif // ENABLE_BF16 + +template +__global__ void sigmoid_kernel(T* data, const int size, const float scale) +{ + const int index = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; + if (index < size) { + float val = cuda_cast(data[index]); + val = 1.0f / (1.0f + exp(-val)) * scale; + data[index] = T(val); + } +} + +template<> +__global__ void sigmoid_kernel(half2* data, const int size, const float scale) +{ + const int index = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + 
threadIdx.x; + if (index < size / 2) { + half2 val = data[index]; + float2 val_float2 = cuda_cast(val); + val_float2.x = 1.0f / (1.0f + exp(-val_float2.x)) * scale; + val_float2.y = 1.0f / (1.0f + exp(-val_float2.y)) * scale; + data[index] = cuda_cast(val_float2); + } +} + +template +void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stream) +{ + if (std::is_same::value || (size % 2 != 0)) { + dim3 block(128); + dim3 grid((size + 127) / 128); + sigmoid_kernel<<>>(data, size, scale); + } + else { + dim3 block(128); + dim3 grid((size + 255) / 256); + sigmoid_kernel<<>>((half2*)data, size, scale); + } +} + +template void invokeSigmoid(float* data, const int size, const float scale, cudaStream_t stream); +template void invokeSigmoid(half* data, const int size, const float scale, cudaStream_t stream); + +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/activation_kernels.h b/src/fastertransformer/kernels/activation_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..e4c561e483921708cf62f6997c970cbdbe4299f6 --- /dev/null +++ b/src/fastertransformer/kernels/activation_kernels.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
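For a LLaMA-style SwiGLU feed-forward block, the gated form of invokeGenericActivation declared in this header is the natural entry point: when gated_weights is non-null it computes out[i] = silu(out[i]) * gated_weights[i] in place. A minimal call-site sketch, assuming d_gate holds the gate projection, d_up the up projection (both [token_num, inter_size] device buffers) and stream is a valid cudaStream_t:

    invokeGenericActivation<SiluActivation, half, half>(
        d_gate, /*bias=*/nullptr, d_up, /*gated_bias=*/nullptr,
        /*ia3_tasks=*/nullptr, /*ia3_weights=*/nullptr,
        token_num, inter_size, /*int8_mode=*/0,
        /*activation_in=*/nullptr, /*activation_out=*/nullptr, stream);
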
+ */ + +#pragma once + +#include "src/fastertransformer/utils/cuda_bf16_wrapper.h" +#include +#include +#include + +namespace fastertransformer { + +// clang-format off +template struct GeluActivation; +template struct ReluActivation; +template struct SiluActivation; +template struct IdentityActivation; +// clang-format on + +template class Activation, typename T, typename BT> +void invokeGenericActivation(T* out, + const BT* bias, + const T* gated_weights, + const BT* gated_bias, + const int* ia3_tasks, + const T* ia3_weights, + const int m, + const int n, + const int int8_mode, + const float* activation_in, + const float* activation_out, + const int* padding_offset, + const int seq_len, + cudaStream_t stream); + +template class Activation, typename T, typename BT> +void invokeGenericActivation(T* out, + const BT* bias, + const T* gated_weights, + const BT* gated_bias, + const int* ia3_tasks, + const T* ia3_weights, + const int m, + const int n, + const int int8_mode, + const float* activation_in, + const float* activation_out, + cudaStream_t stream) +{ + invokeGenericActivation(out, + bias, + gated_weights, + gated_bias, + ia3_tasks, + ia3_weights, + m, + n, + int8_mode, + activation_in, + activation_out, + (const int*)nullptr, + 0, + stream); +} + +template +void invokeAddBiasGeluV2(T* out, + const T* bias, + const int* ia3_tasks, + const T* ia3_weights, + const int* padding_offset, + const int seq_len, + const int m, + const int n, + cudaStream_t stream); + +template +void invokeAddBias(T* out, T const* bias, const int m, const int n, cudaStream_t stream) +{ + invokeGenericActivation( + out, bias, nullptr, nullptr, nullptr, nullptr, m, n, 0, nullptr, nullptr, stream); +} + +template +void invokeAddBiasGeluV2( + T* out, const T* bias, const int* ia3_tasks, const T* ia3_weights, const int m, const int n, cudaStream_t stream) +{ + invokeAddBiasGeluV2(out, bias, ia3_tasks, ia3_weights, nullptr, 0, m, n, stream); +} + +template +void invokeAddBiasTanh(T* out, const T* bias, const int m, const int n, cudaStream_t stream); + +template +void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stream); + +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/ban_bad_words.cu b/src/fastertransformer/kernels/ban_bad_words.cu new file mode 100644 index 0000000000000000000000000000000000000000..e5fb77f004ff236f6c935b194b5e5be13ddb19c5 --- /dev/null +++ b/src/fastertransformer/kernels/ban_bad_words.cu @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
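The kernel below bans an n-gram by pushing the logit of its final token to -infinity whenever the previously generated tokens match the rest of the sequence (single-token entries are banned unconditionally). The bad_words buffer packs all banned sequences into two equal-length rows: row 0 holds the concatenated token ids and row 1 the cumulative end offsets padded with -1, matching the [2, num_ids] "bad_words_list" tensor built in the Triton example. A hypothetical list banning {5, 6, 7} and {42} would be laid out as:

    // row 0 (ids)     :  5   6   7  42
    // row 1 (offsets) :  3   4  -1  -1      // bad_words_len = 4
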
+ */ + +#include "src/fastertransformer/kernels/ban_bad_words.h" +#include "src/fastertransformer/utils/cuda_utils.h" + +namespace fastertransformer { + +template +__global__ void ban_bad_words(T* logits, + const int* output_ids_buf, + const int* parent_ids_buf, + int batch_size, + int beam_width, + const int* bad_words, + size_t bad_words_len, + bool share_words, + int id_offset, + int vocab_size_padded, + size_t step) +{ + const int id = blockIdx.x * blockDim.x + threadIdx.x; + const int batch_idx = blockIdx.y / beam_width; + const int beam_idx = blockIdx.y % beam_width; + + const int* base_bad_words = share_words ? bad_words : bad_words + batch_idx * 2 * bad_words_len; + const int* base_bad_words_offsets = base_bad_words + bad_words_len; + + if (id >= bad_words_len || base_bad_words_offsets[id] < 0) { + return; + } + + const int item_end = base_bad_words_offsets[id]; + const int item_start = (id > 0) ? base_bad_words_offsets[id - 1] : 0; + const int item_size = item_end - item_start; + + /* The single-token case unconditionally bans the token */ + bool should_ban = item_size == 1; + + /* Multi-token case and enough previously generated tokens to look for a match */ + if (item_size > 1 && step >= item_size - 1) { + should_ban = true; + int parent_id = beam_idx; + const bool gather_beam = beam_width > 1; + + for (int token_idx = item_size - 2; token_idx >= 0; token_idx--) { + const int previous_token = output_ids_buf[(step - (item_size - 1) + token_idx) * batch_size * beam_width + + id_offset + batch_idx * beam_width + parent_id]; + + if (previous_token != base_bad_words[item_start + token_idx]) { + should_ban = false; + break; + } + if (gather_beam) { + parent_id = parent_ids_buf[(step - (item_size - 1) + token_idx) * beam_width * batch_size + id_offset + + batch_idx * beam_width + parent_id]; + + if (parent_id < 0 || parent_id >= beam_width) { + should_ban = false; + break; + } + } + } + } + + if (should_ban) { + int banned_token = base_bad_words[item_end - 1]; + if (0 < banned_token && banned_token < vocab_size_padded) { + logits[batch_idx * beam_width * vocab_size_padded + beam_idx * vocab_size_padded + banned_token] = + static_cast(-INFINITY); + } + } +} + +template +void invokeBanBadWords(T* logits, + const int* output_ids_buf, + const int* parent_ids_buf, + int batch_size, + int local_batch_size, + int beam_width, + const int* bad_words, + bool share_words, + size_t bad_words_len, + int id_offset, + int vocab_size_padded, + size_t step, + cudaStream_t stream) +{ + dim3 block, grid; + block.x = min(((bad_words_len + 32 - 1) / 32) * 32, 256UL); + grid.x = (bad_words_len + block.x - 1) / block.x; + grid.y = local_batch_size * beam_width; + + ban_bad_words<<>>(logits, + output_ids_buf, + parent_ids_buf, + batch_size, + beam_width, + bad_words, + bad_words_len, + share_words, + id_offset, + vocab_size_padded, + step); + sync_check_cuda_error(); +} + +template void invokeBanBadWords(half* logits, + const int* output_ids_buf, + const int* parent_ids_buf, + int batch_size, + int local_batch_size, + int beam_width, + const int* bad_words, + bool share_words, + size_t bad_words_len, + int id_offset, + int vocab_size_padded, + size_t step, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeBanBadWords(__nv_bfloat16* logits, + const int* output_ids_buf, + const int* parent_ids_buf, + int batch_size, + int local_batch_size, + int beam_width, + const int* bad_words, + bool share_words, + size_t bad_words_len, + int id_offset, + int vocab_size_padded, + size_t step, + cudaStream_t 
stream); +#endif +template void invokeBanBadWords(float* logits, + const int* output_ids_buf, + const int* parent_ids_buf, + int batch_size, + int local_batch_size, + int beam_width, + const int* bad_words, + bool share_words, + size_t bad_words_len, + int id_offset, + int vocab_size_padded, + size_t step, + cudaStream_t stream); + +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/ban_bad_words.h b/src/fastertransformer/kernels/ban_bad_words.h new file mode 100644 index 0000000000000000000000000000000000000000..6c6b31ac407566e79408faa245745b880d292e72 --- /dev/null +++ b/src/fastertransformer/kernels/ban_bad_words.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace fastertransformer { + +template +void invokeBanBadWords(T* logits, + const int* output_ids_buf, + const int* parent_ids_buf, + int batch_size, + int local_batch_size, + int beam_width, + const int* bad_words, + bool share_words, + size_t bad_words_len, + int id_offset, + int vocab_size_padded, + size_t step, + cudaStream_t stream); + +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/beam_search_penalty_kernels.cu b/src/fastertransformer/kernels/beam_search_penalty_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..64c746ef08f7f68667e66c99e4f89c0d1cf56598 --- /dev/null +++ b/src/fastertransformer/kernels/beam_search_penalty_kernels.cu @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "src/fastertransformer/kernels/beam_search_penalty_kernels.h" +#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh" + +namespace fastertransformer { + +template +__global__ void add_bias_temperature(T* logits, + const T* bias, + const int batch_size, + const int beam_width, + const int vocab_size, + const int vocab_size_padded, + const float temperature) +{ + int tid = threadIdx.x; + int bid = blockIdx.x; + int bbid = blockIdx.y; + + logits += bbid * vocab_size_padded; + + const T MASK_VAL = (std::is_same::value) ? -HALF_FLT_MAX : -FLT_MAX; + const T inv_temp = static_cast(1.0f / (temperature + 1e-6f)); + for (int i = tid + bid * blockDim.x; i < vocab_size_padded; i += blockDim.x * gridDim.x) { + if (i < vocab_size) { + T bias_val = bias == nullptr ? 
(T)(0.0f) : bias[i]; + logits[i] = (logits[i] + bias_val) * inv_temp; + } + else { + logits[i] = MASK_VAL; + } + } +} + +template<> +__global__ void add_bias_temperature(half2* logits, + const half2* bias, + const int batch_size, + const int beam_width, + const int vocab_size, + const int vocab_size_padded, + const float temperature) +{ + assert(vocab_size % 2 == 0); + assert(vocab_size_padded % 2 == 0); + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int bbid = blockIdx.y; + + const half2 mask_val = __float2half2_rn(-HALF_FLT_MAX); + const half2 inv_temp = __float2half2_rn(1.0f / (temperature + 1e-6f)); + + const int half_vocab_size = vocab_size / 2; + const int half_vocab_size_padded = vocab_size_padded / 2; + + logits += bbid * half_vocab_size_padded; + for (int index = tid + bid * blockDim.x; index < half_vocab_size_padded; index += blockDim.x * gridDim.x) { + int vocab_idx = index % half_vocab_size_padded; + half2 logit = vocab_idx < half_vocab_size ? __ldg(&logits[index]) : mask_val; + if (vocab_idx < half_vocab_size) { + if (bias != nullptr) { + logit = __hadd2(logit, bias[vocab_idx]); + } + logit = __hmul2(logit, inv_temp); + } + logits[index] = logit; + } +} + +template +__global__ void apply_repetition_penalty(T* logits, + const int batch_size, + const int beam_width, + const int vocab_size, + const int vocab_size_padded, + const int step, + const int* current_ids, + const int* previous_ids, + const int* parent_ids, + const int* input_lengths, + const int max_input_length, + const float repetition_penalty) +{ + assert(step > 0); + + const int tid = threadIdx.x; + const int bbid = blockIdx.x; + const int batch_id = bbid / beam_width; + const int bbsize = batch_size * beam_width; + + logits += bbid * vocab_size_padded; + extern __shared__ char sbuf[]; + T* penalty_logits = reinterpret_cast(sbuf); + // prevent misaligment when sizeof(T) = 2 + int* penalty_indices = reinterpret_cast(sbuf + (sizeof(T) * step + 31) / 32 * 32); + const int input_length = (input_lengths != nullptr) ? input_lengths[bbid] : max_input_length; + if (tid == 0) { + T repet_penalty = static_cast(repetition_penalty); + int prev_id = current_ids[bbid]; + T prev_logit = logits[prev_id]; + penalty_indices[step - 1] = prev_id; + + if (IS_ADDITIVE) { + penalty_logits[step - 1] = prev_logit - repet_penalty; + } + else { + penalty_logits[step - 1] = prev_logit > T(0) ? prev_logit / repet_penalty : prev_logit * repet_penalty; + } + if (step > 1) { + int parent_beam = bbid % beam_width; + for (int i = step - 2; i >= 0; --i) { + // Skip the padded tokens. + if (i >= input_length && i < max_input_length) { + continue; + } + parent_beam = parent_ids[i * bbsize + batch_id * beam_width + parent_beam]; + prev_id = previous_ids[i * bbsize + batch_id * beam_width + parent_beam]; + prev_logit = logits[prev_id]; + penalty_indices[i] = prev_id; + if (IS_ADDITIVE) { + penalty_logits[i] = prev_logit - repet_penalty; + } + else { + penalty_logits[i] = prev_logit > T(0) ? 
prev_logit / repet_penalty : prev_logit * repet_penalty; + } + } + } + } + __syncthreads(); + for (int i = tid; i < step; i += blockDim.x) { + if (i >= input_length && i < max_input_length) { + continue; + } + logits[penalty_indices[i]] = penalty_logits[i]; + } +} + +template +__global__ void apply_min_length_penalty(T* logits, + const int min_length, + const int* end_ids, + const int* sequence_lengths, + const int max_input_length, + const int beam_width, + const int vocab_size_padded) +{ + int bbid = threadIdx.x + blockIdx.x * blockDim.x; // batch-beam index + int bid = bbid / beam_width; // batch index + // We need +1 because sequence_lengths = max_input_length + num_gen_tokens - 1, + // which is equal to the length of k/v caches. + if (sequence_lengths[bbid] + 1 - max_input_length < min_length) { + T mask_val = (std::is_same::value) ? -HALF_FLT_MAX : -FLT_MAX; + logits[bbid * vocab_size_padded + end_ids[bid]] = mask_val; + } +} + +template +void invokeAddBiasApplyPenalties(int step, + T* logits, + const int* current_ids, + const int* previous_ids, + const int* parent_ids, + const int* input_lengths, + const int* sequence_lengths, + const T* bias, + const int ite, + const int max_input_length, + const int local_batch_size, + const int batch_size, + const int beam_width, + const int vocab_size, + const int vocab_size_padded, + const int* end_ids, + const float temperature, + const float repetition_penalty, + const RepetitionPenaltyType repetition_penalty_type, + const int min_length, + cudaStream_t stream) +{ + if (bias != nullptr || temperature != 1.0f || vocab_size != vocab_size_padded) { + dim3 block(512); + if (std::is_same::value && vocab_size % 2 == 0 && vocab_size_padded % 2 == 0) { + dim3 grid((vocab_size_padded / 2 + block.x - 1) / block.x, beam_width * local_batch_size); + add_bias_temperature<<>>(reinterpret_cast(logits), + reinterpret_cast(bias), + batch_size, + beam_width, + vocab_size, + vocab_size_padded, + temperature); + } + else { + dim3 grid((vocab_size_padded + block.x - 1) / block.x, beam_width * local_batch_size); + add_bias_temperature<<>>( + logits, bias, batch_size, beam_width, vocab_size, vocab_size_padded, temperature); + } + } + + if (repetition_penalty_type != RepetitionPenaltyType::None && step > 0) { + if (repetition_penalty != getDefaultPenaltyValue(repetition_penalty_type)) { + size_t smem_size = (sizeof(T) * step + 31) / 32 * 32 + sizeof(int) * step; + dim3 block(256); + dim3 grid(beam_width * local_batch_size); + if (repetition_penalty_type == RepetitionPenaltyType::Multiplicative) { + apply_repetition_penalty + <<>>(logits, + batch_size, + beam_width, + vocab_size, + vocab_size_padded, + step, + current_ids, + previous_ids, + // TODO(jaedeokk): + // Remove (+ite ...) by getting parent_ids with offset + // and then remove 'ite' argument from the function. 
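+                                                           // Note: the '+ ite * beam_width * local_batch_size' below advances parent_ids to the
+                                                           // first beam of the local-batch chunk handled by this iteration, as the TODO above
+                                                           // describes (the offset could instead be folded into the pointer by the caller).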
+ parent_ids + ite * beam_width * local_batch_size, + input_lengths, + max_input_length, + repetition_penalty); + } + else if (repetition_penalty_type == RepetitionPenaltyType::Additive) { + apply_repetition_penalty + <<>>(logits, + batch_size, + beam_width, + vocab_size, + vocab_size_padded, + step, + current_ids, + previous_ids, + parent_ids + ite * beam_width * local_batch_size, + input_lengths, + max_input_length, + repetition_penalty); + } + } + } + + if (step - max_input_length < min_length) { + FT_CHECK_WITH_INFO(sequence_lengths != nullptr, "Need sequence_lengths to apply min length penlaty"); + FT_CHECK_WITH_INFO(end_ids != nullptr, "Need end_id to apply min length penlaty"); + + const int block_size = min(local_batch_size * beam_width, 1024); + const int grid_size = (local_batch_size * beam_width + block_size - 1) / block_size; + apply_min_length_penalty<<>>( + logits, min_length, end_ids, sequence_lengths, max_input_length, beam_width, vocab_size_padded); + } +} + +template void invokeAddBiasApplyPenalties(int step, + float* logits, + const int* current_ids, + const int* previous_ids, + const int* parent_ids, + const int* input_lengths, + const int* sequence_lengths, + const float* bias, + const int ite, + const int max_input_length, + const int local_batch_size, + const int batch_size, + const int beam_width, + const int vocab_size, + const int vocab_size_padded, + const int* end_ids, + const float temperature, + const float repetition_penalty, + const RepetitionPenaltyType repetition_penalty_type, + const int min_length, + cudaStream_t stream); + +template void invokeAddBiasApplyPenalties(int step, + half* logits, + const int* current_ids, + const int* previous_ids, + const int* parent_ids, + const int* input_lengths, + const int* sequence_lengths, + const half* bias, + const int ite, + const int max_input_length, + const int local_batch_size, + const int batch_size, + const int beam_width, + const int vocab_size, + const int vocab_size_padded, + const int* end_ids, + const float temperature, + const float repetition_penalty, + const RepetitionPenaltyType repetition_penalty_type, + const int min_length, + cudaStream_t stream); + +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/beam_search_penalty_kernels.h b/src/fastertransformer/kernels/beam_search_penalty_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..ae67f9654c752bb870ec35248de7739bd3db3792 --- /dev/null +++ b/src/fastertransformer/kernels/beam_search_penalty_kernels.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include "src/fastertransformer/kernels/penalty_types.h" +#include "src/fastertransformer/utils/cuda_utils.h" + +namespace fastertransformer { + +template +void invokeAddBiasApplyPenalties(int step, + T* logits, + const int* current_ids, + const int* previous_ids, + const int* parent_ids, + const int* input_lengths, + const int* sequence_lengths, + const T* bias, + const int ite, + const int max_input_length, + const int local_batch_size, + const int batch_size, + const int beam_width, + const int vocab_size, + const int vocab_size_padded, + const int* end_ids, + const float temperature, + const float repetition_penalty, + const RepetitionPenaltyType repetition_penalty_type, + const int min_length, + cudaStream_t stream); + +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/beam_search_topk_kernels.cu b/src/fastertransformer/kernels/beam_search_topk_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..fcaf644b0f320fca3eae9cf8191b53b1a6fa90bc --- /dev/null +++ b/src/fastertransformer/kernels/beam_search_topk_kernels.cu @@ -0,0 +1,845 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CUDART_VERSION +#error CUDART_VERSION Undefined! +#elif (CUDART_VERSION >= 11050) +#include +#else +#include "3rdparty/cub/cub.cuh" +#endif + +#include "src/fastertransformer/kernels/beam_search_topk_kernels.h" +#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh" +#include "src/fastertransformer/utils/cuda_type_utils.cuh" +#include "src/fastertransformer/utils/cuda_utils.h" +#include "src/fastertransformer/utils/logger.h" + +namespace fastertransformer { + +template +__device__ __forceinline__ T apply_length_penalty(T log_prob, int length, float length_penalty) +{ + // score = log(prob) / (length)^length_penalty. + if (length_penalty == 0.0f || length == 1) { + return log_prob; + } + return log_prob / static_cast(powf((float)length, length_penalty)); +} + +template +__launch_bounds__(THREADBLOCK_SIZE) __global__ void beam_topK_kernel(const T* log_probs, + int* topk_tmp_id_buf, + T* topk_tmp_val_buf, + const bool* finished, + const int* sequence_lengths, + const int vocab_size, + T diversity_rate, + float length_penalty) +{ + typedef cub::BlockReduce, THREADBLOCK_SIZE> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + int thread_id = threadIdx.x; + int block_id = blockIdx.x; // batch beam index. + TopK partial; + + const bool IS_FP16 = std::is_same::value; + const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX; + +#pragma unroll + for (int i = 0; i < MAX_K; ++i) { + partial.p[i] = -1; + partial.u[i] = -MAX_T_VAL; + } + +#pragma unroll + for (int elem_id = thread_id; elem_id < vocab_size; elem_id += THREADBLOCK_SIZE) { + int index = elem_id + block_id * vocab_size; + T score = length_penalty == 0.0f ? log_probs[index] : + apply_length_penalty(log_probs[index], + finished[block_id] ? 
sequence_lengths[block_id] : + sequence_lengths[block_id] + 1, + length_penalty); + partial.insert(score, index); + } + + TopK total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op); + + if (thread_id == 0) { + int index = block_id * MAX_K; + +#pragma unroll + for (int i = 0; i < MAX_K; ++i) { + topk_tmp_id_buf[index + i] = total.p[i]; + topk_tmp_val_buf[index + i] = total.u[i] + diversity_rate * (T)i; + } + } +} + +template +__launch_bounds__(THREADBLOCK_SIZE) __global__ + void batch_topK_kernel(int* topk_tmp_id_buf, T* topk_tmp_val_buf, int* id_buf) +{ + int thread_id = threadIdx.x; + int block_id = blockIdx.x; + const bool IS_FP16 = std::is_same::value; + const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX; + TopK partial; + if (thread_id == 0) { + for (int i = 0; i < MAX_K; ++i) { + partial.p[i] = -1; + partial.u[i] = -MAX_T_VAL; + } + + int index = block_id * MAX_K * MAX_K; + for (int i = 0; i < MAX_K * MAX_K; i++) { + partial.insert((T)topk_tmp_val_buf[index + i], topk_tmp_id_buf[index + i]); + } + + index = block_id * MAX_K; + for (int i = 0; i < MAX_K; i++) { + id_buf[index + i] = partial.p[i]; + } + } +} + +template +__launch_bounds__(THREADBLOCK_SIZE) __global__ + void batch_topK_kernel_v2(int* topk_tmp_id_buf, T* topk_tmp_val_buf, int* id_buf) +{ + typedef cub::BlockReduce, THREADBLOCK_SIZE> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + int tid = threadIdx.x; + int bid = blockIdx.x; + TopK partial; + const bool IS_FP16 = std::is_same::value; + const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX; + +#pragma unroll + for (int i = 0; i < MAX_K; ++i) { + partial.p[i] = -1; + partial.u[i] = -MAX_T_VAL; + } + + int ite = MAX_K * MAX_K / THREADBLOCK_SIZE; +#pragma unroll + for (int i = 0; i < ite; i++) { + int index = bid * MAX_K * MAX_K + i * THREADBLOCK_SIZE + tid; + partial.insert((T)topk_tmp_val_buf[index], topk_tmp_id_buf[index]); + } + + TopK total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op); + + if (tid == 0) { +#pragma unroll + for (int i = 0; i < MAX_K; i++) { + id_buf[bid * MAX_K + i] = total.p[i]; + } + } +} + +template +__global__ void topk_stage_1_opt3(const T* __restrict log_probs, + T* tmp_log_probs, + int* topk_tmp_id_buf, + T* topk_tmp_val_buf, + const bool* finished, + const int* sequence_lengths, + const int k, + const int vocab_size, + const float length_penalty, + const int* end_ids) +{ + typedef cub::BlockReduce, BLOCK_SIZE_> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + + const int row_id = bid / BLOCKS_PER_BEAM_; // row id for log_probs (batchbeam index) + const int block_lane = bid % BLOCKS_PER_BEAM_; // block id for a beam + const int tmp_log_buf_index = row_id * vocab_size; + const int tmp_topk_buf_index = row_id * BLOCKS_PER_BEAM_ * k + block_lane * k; + TopK_2 partial; + const bool IS_FP16 = std::is_same::value; + const T MAX_T_VAL = (IS_FP16) ? 
HALF_FLT_MAX : FLT_MAX; + + if (finished != nullptr && finished[row_id] == true) { + if (tid < k) { + const int index = tmp_topk_buf_index + tid; + if (block_lane == 0 && tid == 0) { + const int end_id = end_ids[row_id / k]; + topk_tmp_id_buf[index] = tmp_log_buf_index + end_id; + topk_tmp_val_buf[index] = log_probs[tmp_log_buf_index + end_id]; + } + else { + topk_tmp_id_buf[index] = -1; + topk_tmp_val_buf[index] = -MAX_T_VAL; + } + } + return; + } + + for (int elem_id = tid + block_lane * BLOCK_SIZE_; elem_id < vocab_size; + elem_id += BLOCK_SIZE_ * BLOCKS_PER_BEAM_) { + int index = elem_id + tmp_log_buf_index; + tmp_log_probs[index] = log_probs[index]; + } + + for (int ite = 0; ite < k; ite++) { + partial.init(); +#pragma unroll + for (int elem_id = tid + block_lane * BLOCK_SIZE_; elem_id < vocab_size; + elem_id += BLOCK_SIZE_ * BLOCKS_PER_BEAM_) { + int index = elem_id + tmp_log_buf_index; + partial.insert(tmp_log_probs[index], index); + } + + TopK_2 total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op_2); + + if (tid == 0) { + const int index = tmp_topk_buf_index + ite; + topk_tmp_id_buf[index] = total.p; + topk_tmp_val_buf[index] = total.u; + tmp_log_probs[total.p] = -MAX_T_VAL; + } + __syncthreads(); + } +} + +template +__global__ void topk_stage_2_opt3(const int* __restrict topk_tmp_id_buf, + T* topk_tmp_val_buf, + int* ids, + BeamHypotheses beam_hyps, + const int* end_ids, + const int vocab_size, + const int k) +{ + const int size = k * k * BLOCKS_PER_BEAM_; + const int tid = threadIdx.x; + const int batch_id = blockIdx.x; + const bool IS_FP16 = std::is_same::value; + const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX; + + typedef cub::BlockReduce, BLOCK_SIZE_> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + extern __shared__ char array[]; + T* s_val = topk_tmp_val_buf + batch_id * size; + int* s_id = (int*)(array); + + __shared__ int selected_beams; + __shared__ bool is_stop; + + if (tid == 0) { + selected_beams = 0; + is_stop = false; + } + __syncthreads(); + if (beam_hyps.num_beams != nullptr) { + const int global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + batch_id; + if (beam_hyps.num_beams[global_batch_idx] == 0 && tid == 0) { + // initialize the buffer + beam_hyps.min_normed_scores[global_batch_idx] = FLT_MAX; + } + else if (beam_hyps.num_beams[global_batch_idx] == k) { + return; + } + } + + TopK_2 partial; + + // In some cases, we may encounter k finished sentences, but scores are bad. So, the max iteration + // is 2*k here + for (int ite = 0; ite < 2 * k; ite++) { + partial.init(); +#pragma unroll + for (int i = tid; i < size; i += BLOCK_SIZE_) { + partial.insert(s_val[i], i); + } + + TopK_2 total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op_2); + + if (tid == 0) { + if (beam_hyps.num_beams != nullptr + && topk_tmp_id_buf[batch_id * size + total.p] % vocab_size == end_ids[batch_id]) { + // if beam_token does not belong to top num_beams tokens, it should not be added. 
Refer from + // https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/generation_beam_search.py#L257 + if (ite >= k) { + s_val[total.p] = -MAX_T_VAL; + } + else { + const int global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + batch_id; + const float normed_score = + apply_length_penalty(s_val[total.p], beam_hyps.step, beam_hyps.length_penalty); + const int num_beam = beam_hyps.num_beams[global_batch_idx]; + int beam_idx = num_beam; + // If there are beam_width finished sentences, check that the score of selected candidatet + // is higher than min_normed_score or not. If current score is better, replace worst one + // and update the min_normed_score. + if (num_beam == k) { + if (normed_score < beam_hyps.min_normed_scores[global_batch_idx]) { + // end the tracing and exist this for loop + selected_beams = k; + is_stop = true; + break; + } + else { + // find the beam index which's score = min_normed_score, erase it. + for (int j = 0; j < k; j++) { + if (beam_hyps.normed_scores[global_batch_idx * k + j] + == beam_hyps.min_normed_scores[global_batch_idx]) { + beam_idx = j; + beam_hyps.num_beams[global_batch_idx]--; + + beam_hyps.min_normed_scores[global_batch_idx] = FLT_MAX; + beam_hyps.normed_scores[global_batch_idx * k + j] = normed_score; + for (int l = 0; l < k; l++) { + beam_hyps.min_normed_scores[global_batch_idx] = + min(beam_hyps.min_normed_scores[global_batch_idx], + beam_hyps.normed_scores[global_batch_idx * k + l]); + } + break; + } + } + } + } + const int tgt_id_offset = ((batch_id + beam_hyps.ite * beam_hyps.local_batch_size) * k + beam_idx) + * (beam_hyps.max_seq_len); + beam_hyps.output_ids_tgt[tgt_id_offset + beam_hyps.step] = end_ids[batch_id]; + + int prev_id = (topk_tmp_id_buf[batch_id * size + total.p] / vocab_size) % k; + for (int j = beam_hyps.step - 1; j >= 0; j--) { + const int src_idx = j * beam_hyps.batch_size * k + + beam_hyps.ite * beam_hyps.local_batch_size * k + batch_id * k + prev_id; + + beam_hyps.output_ids_tgt[tgt_id_offset + j] = beam_hyps.output_ids_src[src_idx]; + prev_id = beam_hyps.parent_ids_src[src_idx]; + } + const int tgt_beam_idx = global_batch_idx * k + beam_idx; + beam_hyps.sequence_lengths_tgt[tgt_beam_idx] = beam_hyps.step; + beam_hyps.normed_scores[tgt_beam_idx] = normed_score; + beam_hyps.min_normed_scores[global_batch_idx] = + min(beam_hyps.min_normed_scores[global_batch_idx], beam_hyps.normed_scores[tgt_beam_idx]); + + s_val[total.p] = -MAX_T_VAL; + + beam_hyps.num_beams[global_batch_idx]++; + } + } + else { + s_id[selected_beams] = total.p; + s_val[total.p] = -MAX_T_VAL; + selected_beams++; + } + } + __syncthreads(); + if (selected_beams >= k) { + break; + } + } + if (tid < k && is_stop == false) { + ids[batch_id * k + tid] = topk_tmp_id_buf[batch_id * size + s_id[tid]]; + } +} + +template +__global__ void topk_stage_1_opt2_general(const T* __restrict log_probs, + T* tmp_log_probs, + int* topk_tmp_id_buf, + T* topk_tmp_val_buf, + const bool* finished, + const int* sequence_lengths, + const int k, + const int vocab_size, + const float length_penalty) +{ + const bool IS_FP16 = std::is_same::value; + const T MAX_T_VAL = (IS_FP16) ? 
HALF_FLT_MAX : FLT_MAX; + typedef cub::BlockReduce, BLOCK_SIZE> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int row_id = bid / BLOCKS_PER_BEAM; // row id for log_probs + const int block_lane = bid % BLOCKS_PER_BEAM; // block id for a beam + const int tmp_log_buf_index = row_id * vocab_size; + const int tmp_topk_buf_index = row_id * BLOCKS_PER_BEAM * k + block_lane * k; + TopK_2 partial; + + for (int elem_id = tid + block_lane * BLOCK_SIZE; elem_id < vocab_size; elem_id += BLOCK_SIZE * BLOCKS_PER_BEAM) { + int index = elem_id + tmp_log_buf_index; + tmp_log_probs[index] = log_probs[index]; + } + + for (int ite = 0; ite < k; ite++) { + partial.init(); +#pragma unroll + for (int elem_id = tid + block_lane * BLOCK_SIZE; elem_id < vocab_size; + elem_id += BLOCK_SIZE * BLOCKS_PER_BEAM) { + int index = elem_id + tmp_log_buf_index; + partial.insert(tmp_log_probs[index], index); + } + + TopK_2 total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op_2); + + if (tid == 0) { + const int index = tmp_topk_buf_index + ite; + topk_tmp_id_buf[index] = total.p; + topk_tmp_val_buf[index] = total.u; + tmp_log_probs[total.p] = -MAX_T_VAL; + } + __syncthreads(); + } +} + +template +__global__ void topk_stage_2_opt2_general(const int* __restrict topk_tmp_id_buf, + T* topk_tmp_val_buf, + int* ids, + BeamHypotheses beam_hyps, + const int* end_ids, + const int k, + const int vocab_size) +{ + const int size = k * k * BLOCKS_PER_BEAM; + const int tid = threadIdx.x; + const int batch_id = blockIdx.x; + const bool IS_FP16 = std::is_same::value; + const T MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX; + + typedef cub::BlockReduce, BLOCK_SIZE> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + extern __shared__ char array[]; + T* s_val = topk_tmp_val_buf + batch_id * size; + int* s_id = (int*)(array); + + __shared__ int selected_beams; + __shared__ bool is_stop; + + if (tid == 0) { + selected_beams = 0; + is_stop = false; + } + __syncthreads(); + if (beam_hyps.num_beams != nullptr) { + const int global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + batch_id; + if (beam_hyps.num_beams[global_batch_idx] == 0 && tid == 0) { + beam_hyps.min_normed_scores[global_batch_idx] = FLT_MAX; + } + else if (beam_hyps.num_beams[global_batch_idx] == k) { + return; + } + } + + TopK_2 partial; + + // In some cases, we may encounter k finished sentences, but scores are bad. So, the max iteration + // is 2*k here + for (int ite = 0; ite < 2 * k; ite++) { + partial.init(); +#pragma unroll + for (int i = tid; i < size; i += BLOCK_SIZE) { + partial.insert(s_val[i], i); + } + + TopK_2 total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op_2); + + if (tid == 0) { + if (beam_hyps.num_beams != nullptr + && topk_tmp_id_buf[batch_id * size + total.p] % vocab_size == end_ids[batch_id]) { + // if beam_token does not belong to top num_beams tokens, it should not be added. 
Refer from + // https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/generation_beam_search.py#L257 + if (ite >= k) { + s_val[total.p] = -MAX_T_VAL; + } + else { + const int global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + batch_id; + const float normed_score = + apply_length_penalty(s_val[total.p], beam_hyps.step, beam_hyps.length_penalty); + const int num_beam = beam_hyps.num_beams[global_batch_idx]; + int beam_idx = num_beam; + // If there are beam_width finished sentences, check that the score of selected candidatet + // is higher than min_normed_score or not. If current score is better, replace worst one + // and update the min_normed_score. + if (num_beam == k) { + if (normed_score < beam_hyps.min_normed_scores[global_batch_idx]) { + // end the tracing and exist this for loop + selected_beams = k; + is_stop = true; + break; + } + else { + // find the beam index which's score = min_normed_score, erase it. + for (int j = 0; j < k; j++) { + if (beam_hyps.normed_scores[global_batch_idx * k + j] + == beam_hyps.min_normed_scores[global_batch_idx]) { + beam_idx = j; + beam_hyps.num_beams[global_batch_idx]--; + + beam_hyps.min_normed_scores[global_batch_idx] = FLT_MAX; + beam_hyps.normed_scores[global_batch_idx * k + j] = normed_score; + for (int l = 0; l < k; l++) { + beam_hyps.min_normed_scores[global_batch_idx] = + min(beam_hyps.min_normed_scores[global_batch_idx], + beam_hyps.normed_scores[global_batch_idx * k + l]); + } + break; + } + } + } + } + const int tgt_id_offset = ((batch_id + beam_hyps.ite * beam_hyps.local_batch_size) * k + beam_idx) + * (beam_hyps.max_seq_len); + beam_hyps.output_ids_tgt[tgt_id_offset + beam_hyps.step] = end_ids[batch_id]; + + int prev_id = (topk_tmp_id_buf[batch_id * size + total.p] / vocab_size) % k; + for (int j = beam_hyps.step - 1; j >= 0; j--) { + const int src_idx = j * beam_hyps.batch_size * k + + beam_hyps.ite * beam_hyps.local_batch_size * k + batch_id * k + prev_id; + + beam_hyps.output_ids_tgt[tgt_id_offset + j] = beam_hyps.output_ids_src[src_idx]; + prev_id = beam_hyps.parent_ids_src[src_idx]; + } + const int tgt_beam_idx = global_batch_idx * k + beam_idx; + beam_hyps.sequence_lengths_tgt[tgt_beam_idx] = beam_hyps.step; + beam_hyps.normed_scores[tgt_beam_idx] = normed_score; + beam_hyps.min_normed_scores[global_batch_idx] = + min(beam_hyps.min_normed_scores[global_batch_idx], beam_hyps.normed_scores[tgt_beam_idx]); + + s_val[total.p] = -MAX_T_VAL; + + beam_hyps.num_beams[global_batch_idx]++; + } + } + else { + s_id[selected_beams] = total.p; + s_val[total.p] = -MAX_T_VAL; + selected_beams++; + } + } + __syncthreads(); + if (selected_beams >= k) { + break; + } + } + if (tid < k && is_stop == false) { + ids[batch_id * k + tid] = topk_tmp_id_buf[batch_id * size + s_id[tid]]; + } +} + +#define CASE_K_DIV(K, BLOCK_SIZE_1, BLOCK_SIZE_2) \ + case K: \ + beam_topK_kernel<<>>(log_probs, \ + topk_tmp_id_buf, \ + topk_tmp_val_buf, \ + finished, \ + sequence_lengths, \ + vocab_size, \ + diversity_rate, \ + length_penalty); \ + if (K < 10) \ + batch_topK_kernel \ + <<>>(topk_tmp_id_buf, topk_tmp_val_buf, ids); \ + else \ + batch_topK_kernel_v2<<>>(topk_tmp_id_buf, topk_tmp_val_buf, ids); \ + break; + +#define CASE_K(K, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \ + case K: \ + topk_stage_1_opt3 \ + <<>>(log_probs, \ + temp_log_probs, \ + topk_tmp_id_buf, \ + topk_tmp_val_buf, \ + finished, \ + sequence_lengths, \ + beam_width, \ + vocab_size, \ + length_penalty, \ + end_ids); \ + topk_stage_2_opt3 \ + <<>>( \ + 
topk_tmp_id_buf, topk_tmp_val_buf, ids, *beam_hyps, end_ids, vocab_size, beam_width); \ + sync_check_cuda_error(); \ + break; + +template +void invokeTopkBeamSearch(void* workspace, + size_t& workspace_size, + T* log_probs, + int* ids, + BeamHypotheses* beam_hyps, + const bool* finished, + const int* sequence_lengths, + const int batch_size, + const int beam_width, + const int vocab_size_padded_, + const T diversity_rate, + const float length_penalty, + const int* end_ids, + cudaStream_t stream) +{ + FT_LOG_DEBUG("%s", __PRETTY_FUNCTION__); + // log_probs: (batch, beam, vocab) cumulative log_probs of beams ending with a token. + const int vocab_size = vocab_size_padded_; + // Beam size should be less than or equal to vocab size. + assert(beam_width <= vocab_size); + // Beam search needs the sequence lengths of beams to apply length penalty. + assert(length_penalty == 0.0f || sequence_lengths != nullptr); + const int max_block_per_beam = 8; + int temp_log_probs_buf_size = batch_size * beam_width * vocab_size; // type float + int topk_tmp_ids_buf_size = batch_size * beam_width * beam_width * max_block_per_beam; // type int + int topk_tmp_val_buf_size = batch_size * beam_width * beam_width * max_block_per_beam; // type float + + // prevent memory misaligned address + temp_log_probs_buf_size = (int)(ceil(temp_log_probs_buf_size / 4.)) * 4; + topk_tmp_ids_buf_size = (int)(ceil(topk_tmp_ids_buf_size / 4.)) * 4; + topk_tmp_val_buf_size = (int)(ceil(topk_tmp_val_buf_size / 4.)) * 4; + + if (workspace == nullptr) { + workspace_size = sizeof(float) * temp_log_probs_buf_size + sizeof(int) * topk_tmp_ids_buf_size + + sizeof(float) * topk_tmp_val_buf_size; + return; + } + else { + T* temp_log_probs = (T*)workspace; + int* topk_tmp_id_buf = (int*)(temp_log_probs + temp_log_probs_buf_size); + T* topk_tmp_val_buf = (T*)(topk_tmp_id_buf + topk_tmp_ids_buf_size); + if (diversity_rate == 0.0f) { + switch (beam_width) { + CASE_K(1, 128, 128, 8); + CASE_K(4, 128, 128, 8); + CASE_K(10, 128, 128, 8); + CASE_K(16, 128, 128, 5); + CASE_K(32, 256, 128, 1); + CASE_K(64, 256, 256, 1); + default: + topk_stage_1_opt2_general + <<>>(log_probs, + temp_log_probs, + topk_tmp_id_buf, + topk_tmp_val_buf, + finished, + sequence_lengths, + beam_width, + vocab_size, + length_penalty); + topk_stage_2_opt2_general + <<>>( + topk_tmp_id_buf, topk_tmp_val_buf, ids, *beam_hyps, end_ids, beam_width, vocab_size); + break; + } + } + else { + switch (beam_width) { + CASE_K_DIV(1, 256, 256); + CASE_K_DIV(4, 256, 256); + CASE_K_DIV(16, 256, 64); + CASE_K_DIV(32, 256, 64); + CASE_K_DIV(64, 256, 64); + default: + FT_CHECK_WITH_INFO(false, fmtstr("Topk kernel does not support beamwidth = %d \n", beam_width)); + break; + } + } + return; + } +} + +#undef CASE_K +#undef CASE_K_DIV + +template void invokeTopkBeamSearch(void* workspace, + size_t& workspace_size, + float* log_probs, + int* ids, + BeamHypotheses* beam_hyps, + const bool* finished, + const int* sequence_lengths, + const int batch_size, + const int beam_width, + const int vocab_size_padded_, + const float diversity_rate, + const float length_penalty, + const int* end_ids, + cudaStream_t stream); + +template +__global__ void tileEncoderResults(T* tiled_output, + int* tiled_sequence_length, + const T* output, + const int* sequence_length, + const uint batch_size, + const uint beam_width, + const uint d_model) +{ + if (blockIdx.x == 0) { + for (uint i = threadIdx.x; i < batch_size * beam_width; i += blockDim.x) { + tiled_sequence_length[i] = sequence_length[i / beam_width]; + } + } + + 
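+    // The launcher below sets grid = (batch_size, beam_width, mem_max_seq_len), so blockIdx.x/y/z
+    // select the batch, beam and token position. tgt_offset addresses the d_model-wide row
+    // tiled_output[batch][beam][token], while src_offset addresses output[batch][token]; each
+    // encoder row is therefore copied beam_width times.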
int tgt_offset = + blockIdx.x * gridDim.y * gridDim.z * d_model + blockIdx.y * gridDim.z * d_model + blockIdx.z * d_model; + int src_offset = blockIdx.x * gridDim.z * d_model + blockIdx.z * d_model; + for (uint i = threadIdx.x; i < d_model; i += blockDim.x) { + tiled_output[i + tgt_offset] = output[i + src_offset]; + } +} + +template +void invokeTileEncoderResults(T* tiled_output, + int* tiled_sequence_length, + const T* output, + const int* sequence_length, + const size_t batch_size, + const size_t beam_width, + const size_t mem_max_seq_len, + const size_t d_model, + cudaStream_t stream) +{ + // tiled_output: [batch_size, beam_width, mem_max_seq_len, d_model] + // tiled_sequence_length: [batch_size, beam_width] + + // output: [batch_size, mem_max_seq_len, d_model] + // sequence_length [batch_size] + + dim3 grid(batch_size, beam_width, mem_max_seq_len); + bool is_half2 = (std::is_same::value) && (d_model % 2 == 0); + + if (is_half2) { + using T2 = typename TypeConverter::Type; // fp16 to half2, bf16 to bf162 + dim3 block(min(512, (int)(d_model / 2))); + tileEncoderResults<<>>((T2*)tiled_output, + tiled_sequence_length, + (const T2*)output, + sequence_length, + batch_size, + beam_width, + d_model / 2); + } + else { + dim3 block(min(512, (int)d_model)); + tileEncoderResults<<>>( + tiled_output, tiled_sequence_length, output, sequence_length, batch_size, beam_width, d_model); + } +} + +template void invokeTileEncoderResults(float* tiled_output, + int* tiled_sequence_length, + const float* output, + const int* sequence_length, + const size_t batch_size, + const size_t beam_width, + const size_t mem_max_seq_len, + const size_t d_model, + cudaStream_t stream); + +template void invokeTileEncoderResults(half* tiled_output, + int* tiled_sequence_length, + const half* output, + const int* sequence_length, + const size_t batch_size, + const size_t beam_width, + const size_t mem_max_seq_len, + const size_t d_model, + cudaStream_t stream); + +template void invokeTileEncoderResults(half2* tiled_output, + int* tiled_sequence_length, + const half2* output, + const int* sequence_length, + const size_t batch_size, + const size_t beam_width, + const size_t mem_max_seq_len, + const size_t d_model, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeTileEncoderResults(__nv_bfloat16* tiled_output, + int* tiled_sequence_length, + const __nv_bfloat16* output, + const int* sequence_length, + const size_t batch_size, + const size_t beam_width, + const size_t mem_max_seq_len, + const size_t d_model, + cudaStream_t stream); +#endif + +__global__ void insertUnfinishedPath(BeamHypotheses beam_hyps, + const bool* finished, + const float* cum_log_probs, + const int batch_size, + const int beam_width) +{ + const int bid = blockIdx.x; + const int tgt_start_idx = beam_hyps.num_beams[bid]; + if (beam_hyps.is_done[bid]) { + return; + } + for (int i = 0; i < beam_width; i++) { + if (threadIdx.x == 0) { + const int src_beam_idx = bid * beam_width + i; + const int tgt_beam_idx = bid * beam_width * 2 + i + tgt_start_idx; + + const int length = beam_hyps.sequence_lengths_src[src_beam_idx]; + + beam_hyps.output_ids_tgt[(tgt_beam_idx) * (beam_hyps.max_seq_len + 1) + length] = + beam_hyps.output_ids_src[length * batch_size * beam_width + src_beam_idx]; + if (beam_hyps.log_probs != nullptr && beam_hyps.log_probs_src != nullptr) { + beam_hyps.log_probs[(tgt_beam_idx) * (beam_hyps.max_seq_len + 1) + length] = + beam_hyps.log_probs_src[length * batch_size * beam_width + src_beam_idx]; + } + int prev_id = 
beam_hyps.parent_ids_src[length * batch_size * beam_width + src_beam_idx]; + for (int j = length - 1; j >= 0; j--) { + // output_ids_tgt need to use max_seq_len + 1 because its shape is + // [bs, beam_width, max_seq_len + 1] + beam_hyps.output_ids_tgt[(tgt_beam_idx) * (beam_hyps.max_seq_len + 1) + j] = + beam_hyps.output_ids_src[j * batch_size * beam_width + bid * beam_width + prev_id]; + if (beam_hyps.log_probs != nullptr && beam_hyps.log_probs_src != nullptr) { + beam_hyps.log_probs[(tgt_beam_idx) * (beam_hyps.max_seq_len + 1) + j] = + beam_hyps.log_probs_src[j * batch_size * beam_width + bid * beam_width + prev_id]; + } + prev_id = beam_hyps.parent_ids_src[j * batch_size * beam_width + bid * beam_width + prev_id]; + } + beam_hyps.sequence_lengths_tgt[tgt_beam_idx] = length; + + beam_hyps.normed_scores[tgt_beam_idx] = apply_length_penalty( + cum_log_probs[src_beam_idx], finished[src_beam_idx] ? length + 1 : length, beam_hyps.length_penalty); + beam_hyps.cum_log_probs[tgt_beam_idx] = cum_log_probs[src_beam_idx]; + + beam_hyps.num_beams[bid]++; + } + } +} + +void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps, + const bool* finished, + const float* cum_log_probs, + const int batch_size, + const int beam_width, + cudaStream_t stream) +{ + insertUnfinishedPath<<>>(beam_hyps, finished, cum_log_probs, batch_size, beam_width); +} + +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/beam_search_topk_kernels.h b/src/fastertransformer/kernels/beam_search_topk_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..60732a5943a6c1e305a6f76a46e0177007474d87 --- /dev/null +++ b/src/fastertransformer/kernels/beam_search_topk_kernels.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#pragma once + +namespace fastertransformer { + +// In original beam search implementation, if a beam is finished, we set it as finished +// and only continue to do beam search on remain beams (namely, beam_width - 1 beams in next step) +// +// In this implementation, when a beam is finished, we trace the path and record it in output_ids_tgt, +// and also record the normalized scores. And the beam search continue to use `beam_width` beams in +// next step. +// +// After we collect `beam_width` beams, we will sort them by their norm_scores. 
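+// For reference, the ranking score kept in `normed_scores` below is the cumulative log prob divided by
+// length^length_penalty, mirroring the device helper apply_length_penalty() in beam_search_topk_kernels.cu.
+// A minimal host-side sketch of that formula follows; the helper name is ours (it is not used by any
+// kernel) and it assumes math.h is visible through the includes above.
+static inline float normed_score_sketch(float cum_log_prob, int length, float length_penalty)
+{
+    // No penalty requested, or a single generated token: keep the cumulative log prob as-is.
+    if (length_penalty == 0.0f || length == 1) {
+        return cum_log_prob;
+    }
+    // Otherwise normalize: cum_log / (length ** length_penalty).
+    return cum_log_prob / powf(static_cast<float>(length), length_penalty);
+}
+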
+struct BeamHypotheses { + int* output_ids_tgt = nullptr; + int* sequence_lengths_tgt = nullptr; + float* cum_log_probs = nullptr; // cum_log + float* normed_scores = nullptr; // cum_log / (length**length_penalty) + float* log_probs = nullptr; // log probs of each generated token + float* min_normed_scores = nullptr; // record the min normed scores for each batch + int* num_beams = nullptr; // the number of finished beams we collect + bool* is_done = nullptr; + + // Used to set inputs + const int* output_ids_src; + const int* parent_ids_src; + const int* sequence_lengths_src; + const int* end_ids; + const float* log_probs_src; + + // some variables for kernels + int step; + int ite; + int batch_size; + int local_batch_size; + int max_seq_len; + float length_penalty; + + bool early_stopping = true; + bool is_return_normed_score = true; // return normed_cum_log_probs or cum_log_probs +}; + +template +void invokeTopkBeamSearch(void* workspace, + size_t& workspace_size, + T* log_probs, + int* ids, + BeamHypotheses* beam_hyps, + const bool* finished, + const int* sequence_lengths, + const int batch_size, + const int beam_width, + const int vocab_size_padded_, + const T diversity_rate, + const float length_penalty, + const int* end_ids, + cudaStream_t stream); + +template +void invokeTileEncoderResults(T* tiled_encoder_output, + int* tiled_encoder_sequence_length, + const T* encoder_output, + const int* encoder_sequence_length, + const size_t batch_size, + const size_t beam_width, + const size_t mem_max_seq_len, + const size_t d_model, + cudaStream_t stream); + +void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps, + const bool* finished, + const float* cum_log_probs, + const int batch_size, + const int beam_width, + cudaStream_t stream); + +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.cu b/src/fastertransformer/kernels/bert_preprocess_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..a57161c8596659298631d2c054ae60e731912d7d --- /dev/null +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.cu @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "bert_preprocess_kernels.h" +#include "src/fastertransformer/utils/cuda_bf16_fallbacks.cuh" +#include "src/fastertransformer/utils/cuda_fp8_utils.h" +#include "src/fastertransformer/utils/cuda_type_utils.cuh" + +namespace fastertransformer { + +__global__ void getPaddingOffsetAndCuSeqLensKernel(size_t* h_valid_word_num, + int* tmp_mask_offset, + int* cu_seqlens, + const int* sequence_length, + const int batch_size, + const int max_seq_len) +{ + // do cumulated sum + int total_seq_len = 0; + int cum_offset = 0; + int index = 0; + const bool calculate_cu_seqlens = cu_seqlens != nullptr; + for (int i = 0; i < batch_size; i++) { + const int seq_len = sequence_length[i]; + if (calculate_cu_seqlens) { + cu_seqlens[i] = total_seq_len; + } + for (int j = 0; j < seq_len; j++) { + tmp_mask_offset[index] = cum_offset; + index++; + } + cum_offset += max_seq_len - seq_len; + total_seq_len += seq_len; + } + if (calculate_cu_seqlens) { + cu_seqlens[batch_size] = total_seq_len; + } + h_valid_word_num[0] = (size_t)total_seq_len; +} + +void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num, + size_t* h_token_num, + int* tmp_mask_offset, + int* cu_seqlens, + const int* sequence_lengths, + const int batch_size, + const int max_seq_len, + cudaStream_t stream) +{ + h_pinned_token_num[0] = 0; + getPaddingOffsetAndCuSeqLensKernel<<<1, 1, 0, stream>>>( + h_pinned_token_num, tmp_mask_offset, cu_seqlens, sequence_lengths, batch_size, max_seq_len); + while (((volatile size_t*)h_pinned_token_num)[0] == 0) {}; + h_token_num[0] = h_pinned_token_num[0]; + sync_check_cuda_error(); +} + +template +__global__ void buildEncoderAttentionMaskKernel(T* attention_mask, const int* sequence_lengths, const int max_seq_len) +{ + // sequence_lengths: [batch_size] + // attention_mask: [batch_size, 1, max_seq_len, max_seq_len] + attention_mask += blockIdx.x * max_seq_len * max_seq_len; + const int length = sequence_lengths[blockIdx.x]; + for (int i = threadIdx.x; i < max_seq_len * max_seq_len; i += blockDim.x) { + // int row_id = i / max_seq_len; + int col_id = i % max_seq_len; + // if (row_id < length && col_id < length) { + // TODO (bhsueh) check this modification is ok or not on other rmodel + if (col_id < length) { + attention_mask[i] = (T)(1.0f); + } + else { + attention_mask[i] = (T)(0.0f); + } + } +} + +template +void invokeBuildEncoderAttentionMask( + T* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream) +{ + buildEncoderAttentionMaskKernel<<>>(attention_mask, sequence_lengths, max_seq_len); +} + +template void invokeBuildEncoderAttentionMask(float* attention_mask, + const int* sequence_lengths, + const int batch_size, + const int max_seq_len, + cudaStream_t stream); +template void invokeBuildEncoderAttentionMask(half* attention_mask, + const int* sequence_lengths, + const int batch_size, + const int max_seq_len, + cudaStream_t stream); +#ifdef ENABLE_FP8 +template void invokeBuildEncoderAttentionMask(__nv_fp8_e4m3* attention_mask, + const int* sequence_lengths, + const int batch_size, + const int max_seq_len, + cudaStream_t stream); +#endif // ENABLE_FP8 +#ifdef ENABLE_BF16 +template void invokeBuildEncoderAttentionMask(__nv_bfloat16* attention_mask, + const int* sequence_lengths, + const int batch_size, + const int max_seq_len, + cudaStream_t stream); +#endif + +__global__ void getTrtPaddingOffsetKernel(int* trt_mha_padding_offset, const int* sequence_length, const int batch_size) +{ + // use for get tensorrt fused mha padding offset + // 
when we remove the padding + + extern __shared__ int tmp_offset[]; + if (threadIdx.x == 0) { + tmp_offset[0] = 0; + for (int i = 0; i < batch_size; i++) { + tmp_offset[i + 1] = tmp_offset[i] + sequence_length[i]; + } + } + __syncthreads(); + + for (int i = threadIdx.x; i < batch_size + 1; i += blockDim.x) { + trt_mha_padding_offset[i] = tmp_offset[i]; + } +} + +void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset, + const int* sequence_length, + const int batch_size, + cudaStream_t stream) +{ + getTrtPaddingOffsetKernel<<<1, 256, sizeof(int) * (batch_size + 1), stream>>>( + trt_mha_padding_offset, sequence_length, batch_size); +} + +__global__ void getTrtPaddingOffsetKernel(int* trt_mha_padding_offset, + const int* sequence_length, + const int request_batch_size, + const int request_seq_len) +{ + // use for get tensorrt fused mha padding offset + // when we keep the padding + + extern __shared__ int tmp_offset[]; + if (threadIdx.x == 0) { + tmp_offset[0] = 0; + for (int i = 0; i < request_batch_size; i++) { + tmp_offset[i * 2 + 1] = tmp_offset[i * 2] + sequence_length[i]; + tmp_offset[i * 2 + 2] = request_seq_len * (i + 1); + } + } + __syncthreads(); + + for (int i = threadIdx.x; i < 2 * request_batch_size + 1; i += blockDim.x) { + trt_mha_padding_offset[i] = tmp_offset[i]; + } +} + +void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset, + const int* sequence_length, + const int request_batch_size, + const int request_seq_len, + cudaStream_t stream) +{ + getTrtPaddingOffsetKernel<<<1, 256, sizeof(int) * (2 * request_batch_size + 1), stream>>>( + trt_mha_padding_offset, sequence_length, request_batch_size, request_seq_len); +} + +template +__global__ void rebuild_sequence_length_padding(const T* src, T* dst, const int* padding_offset, const int n) +{ + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int dst_seq_id = bid + padding_offset[bid]; + const int src_seq_id = bid; + + for (int i = tid; i < n; i += blockDim.x) { + dst[dst_seq_id * n + i] = src[src_seq_id * n + i]; + } +} + +template +void invokeRebuildPadding( + T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream) +{ + // src: [token_num, hidden_dim] + // dst: [batch_size*max_seq_len, hidden_dim] + rebuild_sequence_length_padding<<>>(src, dst, padding_offset, hidden_dim); +} + +template +void invokeRebuildPadding( + T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream); +template void invokeRebuildPadding(float* dst, + const float* src, + const int* padding_offset, + const int token_num, + const int hidden_dim, + cudaStream_t stream); +template void invokeRebuildPadding(half* dst, + const half* src, + const int* padding_offset, + const int token_num, + const int hidden_dim, + cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeRebuildPadding(__nv_bfloat16* dst, + const __nv_bfloat16* src, + const int* padding_offset, + const int token_num, + const int hidden_dim, + cudaStream_t stream); +#endif // ENABLE_BF16 + +#ifdef ENABLE_FP8 +template void invokeRebuildPadding(__nv_fp8_e4m3* dst, + const __nv_fp8_e4m3* src, + const int* padding_offset, + const int token_num, + const int hidden_dim, + cudaStream_t stream); +#endif // ENABLE_FP8 + +template +__global__ void remove_padding(T* tgt, const T* src, const int* padding_offset, const int n) +{ + const int tid = threadIdx.x; + const int bid = blockIdx.x; + const int src_seq_id = bid + padding_offset[bid]; + const int tgt_seq_id = 
bid; + + for (int i = tid; i < n; i += blockDim.x) { + tgt[tgt_seq_id * n + i] = src[src_seq_id * n + i]; + } +} + +template +void invokeRemovePadding( + T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream) +{ + remove_padding<<>>(dst, src, padding_offset, hidden_dim); +} + +template void invokeRemovePadding(float* dst, + const float* src, + const int* padding_offset, + const int token_num, + const int hidden_dim, + cudaStream_t stream); + +template void invokeRemovePadding(half* dst, + const half* src, + const int* padding_offset, + const int token_num, + const int hidden_dim, + cudaStream_t stream); +#ifdef ENABLE_FP8 +template void invokeRemovePadding(__nv_fp8_e4m3* dst, + const __nv_fp8_e4m3* src, + const int* padding_offset, + const int token_num, + const int hidden_dim, + cudaStream_t stream); +#endif // ENABLE_FP8 +#ifdef ENABLE_BF16 +template void invokeRemovePadding(__nv_bfloat16* dst, + const __nv_bfloat16* src, + const int* padding_offset, + const int token_num, + const int hidden_dim, + cudaStream_t stream); +#endif + +template +__global__ void buildRelativeAttentionBias(T* relative_attention_bias, + const T* relative_attention_bias_table, + const int head_num, + const int seq_len, + const int num_bucket, + const bool is_bidirectional, + const int max_distance) +{ + + const int head_id = blockIdx.x; + for (int seq_id = threadIdx.x; seq_id < seq_len * seq_len; seq_id += blockDim.x) { + int row_id = seq_id / seq_len; + int col_id = seq_id % seq_len; + + int relative_position = col_id - row_id; + + int relative_buckets = 0; + int tmp_num_bucket = num_bucket; + if (is_bidirectional) { + tmp_num_bucket /= 2; + if (relative_position > 0) { + relative_buckets += tmp_num_bucket; + } + else { + relative_position *= -1; + } + } + else { + relative_position = abs(relative_position); + } + + int max_exact = tmp_num_bucket / 2; + bool is_small = relative_position < max_exact; + + int relative_position_if_large = + max_exact + + (int)(logf(relative_position * 1.0f / max_exact) / logf((float)max_distance / max_exact) + * (tmp_num_bucket - max_exact)); + + relative_position_if_large = min(relative_position_if_large, tmp_num_bucket - 1); + + relative_buckets += is_small ? 
relative_position : relative_position_if_large; + + relative_attention_bias[head_id * seq_len * seq_len + seq_id] = + relative_attention_bias_table[head_id * num_bucket + relative_buckets]; + } +} + +template +void invokeBuildRelativeAttentionBias(T* relative_attention_bias, + const T* relative_attention_bias_table, + const int head_num, + const int seq_len, + const int num_bucket, + const bool is_bidirectional, + const int max_distance, + const PositionEmbeddingType position_embedding_type, + cudaStream_t stream) +{ + if (position_embedding_type == PositionEmbeddingType::absolute) { + return; + } + dim3 grid(head_num); + dim3 block(256); + buildRelativeAttentionBias<<>>(relative_attention_bias, + relative_attention_bias_table, + head_num, + seq_len, + num_bucket, + is_bidirectional, + max_distance); +} + +template void invokeBuildRelativeAttentionBias(float* relative_attention_bias, + const float* relative_attention_bias_table, + const int head_num, + const int seq_len, + const int num_bucket, + const bool is_bidirectional, + const int max_distance, + const PositionEmbeddingType position_embedding_type, + cudaStream_t stream); + +template void invokeBuildRelativeAttentionBias(half* relative_attention_bias, + const half* relative_attention_bias_table, + const int head_num, + const int seq_len, + const int num_bucket, + const bool is_bidirectional, + const int max_distance, + const PositionEmbeddingType position_embedding_type, + cudaStream_t stream); + +#ifdef ENABLE_BF16 +template void invokeBuildRelativeAttentionBias(__nv_bfloat16* relative_attention_bias, + const __nv_bfloat16* relative_attention_bias_table, + const int head_num, + const int seq_len, + const int num_bucket, + const bool is_bidirectional, + const int max_distance, + const PositionEmbeddingType position_embedding_type, + cudaStream_t stream); +#endif + +#ifdef ENABLE_FP8 + +template +__global__ void getLastTokenDequantize(getLastTokenDequantizeParam param) +{ + param.output[blockIdx.x * param.d_model + threadIdx.x] = + (T_OUT)((float)param.input[blockIdx.x * param.max_seq_len * param.d_model + threadIdx.x] + * __ldg(param.input_scale)); +} + +template +void invokeGetLastTokenDequantize(getLastTokenDequantizeParam param) +{ + FT_CHECK(param.d_model <= 1024); + getLastTokenDequantize<<>>(param); +} + +template void invokeGetLastTokenDequantize<__nv_bfloat16, __nv_fp8_e4m3>( + getLastTokenDequantizeParam<__nv_bfloat16, __nv_fp8_e4m3> param); + +template +__global__ void quantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam param) +{ + for (int i = threadIdx.x; i < param.d_model; i += blockDim.x) { + int padded_row_id = blockIdx.x + (param.padding_offset == nullptr ? 0 : param.padding_offset[blockIdx.x]); + if (quantize_mode == QUANTIZE_MODE::PER_TENSOR) { + param.dst[padded_row_id * param.d_model + i] = + (T_OUT)((float)param.src[blockIdx.x * param.d_model + i] * __ldg(param.scale)); + } + else if (quantize_mode == QUANTIZE_MODE::PER_CHANNEL) { + param.dst[padded_row_id * param.d_model + i] = + (T_OUT)((float)param.src[blockIdx.x * param.d_model + i] * __ldg(param.scale + i)); + } + } +} + +template<> +__global__ void +quantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam param) +{ + int padded_row_id = blockIdx.x + (param.padding_offset == nullptr ? 
0 : __ldg(¶m.padding_offset[blockIdx.x])); + __nv_fp8x4_e4m3* src_ptr = ((__nv_fp8x4_e4m3*)param.src) + blockIdx.x * (param.d_model / 4); + half2* dst_ptr = ((half2*)param.dst) + padded_row_id * (param.d_model / 2); + half2 scale = cuda_cast(__ldg(param.scale)); + for (int i = threadIdx.x; i < param.d_model / 4; i += blockDim.x) { + half2 val_0; + half2 val_1; + fp8x4_e4m3_to_half2(&val_0, &val_1, src_ptr + i); + + val_0 = hmul2(val_0, scale); + val_1 = hmul2(val_1, scale); + + dst_ptr[2 * i + 0] = val_0; + dst_ptr[2 * i + 1] = val_1; + } +} + +template +void invokeQuantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam param) +{ + dim3 grid(param.token_num); + dim3 block(param.d_model); + FT_CHECK(block.x <= 1024); + if (block.x % 4 == 0) { + block.x /= 4; + } + quantizeMatrixRebuildPadding<<>>(param); +} + +template void invokeQuantizeMatrixRebuildPadding( + QuantizeMatrixRebuildPaddingParam param); + +#endif + +} // namespace fastertransformer \ No newline at end of file diff --git a/src/fastertransformer/kernels/bert_preprocess_kernels.h b/src/fastertransformer/kernels/bert_preprocess_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..2de48657dc92fffa02e0b033e73fec0f3580c691 --- /dev/null +++ b/src/fastertransformer/kernels/bert_preprocess_kernels.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include "src/fastertransformer/kernels/gen_relative_pos_bias.h" +#include "src/fastertransformer/utils/cuda_utils.h" +#include +#include +#ifdef ENABLE_FP8 +#include "src/fastertransformer/utils/cuda_fp8_utils.h" +#endif // ENABLE_FP8 + +namespace fastertransformer { + +void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num, + size_t* h_token_num, + int* tmp_mask_offset, + int* cu_seqlens, + const int* sequence_length, + const int batch_size, + const int max_seq_len, + cudaStream_t stream); + +inline void invokeGetPaddingOffset(size_t* h_pinned_token_num, + size_t* h_token_num, + int* tmp_mask_offset, + const int* sequence_length, + const int batch_size, + const int max_seq_len, + cudaStream_t stream) +{ + invokeGetPaddingOffsetAndCuSeqLens( + h_pinned_token_num, h_token_num, tmp_mask_offset, nullptr, sequence_length, batch_size, max_seq_len, stream); +} + +template +void invokeBuildEncoderAttentionMask( + T* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream); + +void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset, + const int* sequence_length, + const int request_batch_size, + cudaStream_t stream); + +void invokeGetTrtPaddingOffset(int* trt_mha_padding_offset, + const int* sequence_length, + const int request_batch_size, + const int request_seq_len, + cudaStream_t stream); + +template +void invokeRebuildPadding( + T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream); + +template +void invokeRemovePadding( + T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream); + +template +void invokeBuildRelativeAttentionBias(T* relative_attention_bias, + const T* relative_attention_bias_table, + const int head_num, + const int seq_len, + const int num_bucket, + const bool is_bidirectional, + const int max_distance, + const PositionEmbeddingType position_embedding_type, + cudaStream_t stream); + +template +struct getLastTokenDequantizeParam { + T_OUT* const output; + T_IN const* const input; + float const* const input_scale; + + const int batch_size; + const int max_seq_len; + const int d_model; + cudaStream_t stream; +}; + +template +void invokeGetLastTokenDequantize(getLastTokenDequantizeParam param); + +#ifdef ENABLE_FP8 +template +struct QuantizeMatrixRebuildPaddingParam { + T_OUT* dst; + const T_IN* src; + const int* padding_offset; + const int token_num; + const int d_model; + const float* scale; + cudaStream_t stream; +}; + +template +void invokeQuantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam param); +#endif // ENABLE_FP8 + +} // namespace fastertransformer diff --git a/src/fastertransformer/kernels/custom_ar_kernels.cu b/src/fastertransformer/kernels/custom_ar_kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..af8aee128f6f6e96abb4f58bc5bb7638836fcf85 --- /dev/null +++ b/src/fastertransformer/kernels/custom_ar_kernels.cu @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "custom_ar_kernels.h" +#include "src/fastertransformer/utils/cuda_type_utils.cuh" + +namespace fastertransformer { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t hadd2(const uint32_t& a, const uint32_t& b) +{ + uint32_t c; + asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ uint32_t fadd(const uint32_t& a, const uint32_t& b) +{ + uint32_t c; + asm volatile("add.f32 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ void st_flag_release(uint32_t& flag, uint32_t* flag_addr) +{ +#if __CUDA_ARCH__ >= 700 + asm volatile("st.global.release.sys.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr)); +#else + __threadfence_system(); + asm volatile("st.global.volatile.b32 [%1], %0;" ::"r"(flag), "l"(flag_addr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static inline __device__ void ld_flag_acquire(uint32_t& flag, uint32_t* flag_addr) +{ +#if __CUDA_ARCH__ >= 700 + asm volatile("ld.global.acquire.sys.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr)); +#else + asm volatile("ld.global.volatile.b32 %0, [%1];" : "=r"(flag) : "l"(flag_addr)); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Type Converter that packs data format to 128 bits data type +template +struct ARTypeConverter { + using Type = uint4; +}; + +#ifdef ENABLE_BF16 +template<> +struct ARTypeConverter<__nv_bfloat16> { + using Type = bf168; +}; +#endif + +// add two 128b data +template +inline __device__ T_IN add128b(T_IN a, T_IN b); + +template<> +inline __device__ uint4 add128b(uint4 a, uint4 b) +{ + uint4 c; + c.x = hadd2(a.x, b.x); + c.y = hadd2(a.y, b.y); + c.z = hadd2(a.z, b.z); + c.w = hadd2(a.w, b.w); + return c; +} + +template<> +inline __device__ uint4 add128b(uint4 a, uint4 b) +{ + uint4 c; + c.x = fadd(a.x, b.x); + c.y = fadd(a.y, b.y); + c.z = fadd(a.z, b.z); + c.w = fadd(a.w, b.w); + return c; +} + +#ifdef ENABLE_BF16 +template<> +inline __device__ bf168 add128b(bf168 a, bf168 b) +{ + bf168 c; + c.x = bf16hadd2(a.x, b.x); + c.y = bf16hadd2(a.y, b.y); + c.z = bf16hadd2(a.z, b.z); + c.w = bf16hadd2(a.w, b.w); + return c; +} +#endif + +// init 128bits data with 0 +template +inline __device__ T init_packed_type(); + +template<> +inline __device__ uint4 init_packed_type() +{ + return make_uint4(0u, 0u, 0u, 0u); +} + +#ifdef ENABLE_BF16 +template<> +inline __device__ bf168 init_packed_type() +{ + bf168 val; + uint4& val_u = reinterpret_cast(val); + val_u = make_uint4(0u, 0u, 0u, 0u); + return val; +} +#endif + +template +static __global__ void oneShotAllReduceKernel(AllReduceParams params) +{ + // The block index. 
+ const int bidx = blockIdx.x; + // The thread index with the block. + const int tidx = threadIdx.x; + + // The number of elements packed into one for comms + static constexpr int NUM_ELTS = std::is_same::value ? 4 : 8; + + // Packed data type for comms + using PackedType = typename ARTypeConverter::Type; + + // The location in the destination array (load 8 fp16 or load 4 fp32 using LDG.128). + size_t offset = bidx * params.elts_per_block + tidx * NUM_ELTS; + // The end of the segment computed by that block. + size_t max_offset = std::min((bidx + 1) * params.elts_per_block, params.elts_per_rank); + + // Synchronize the ranks. + volatile uint32_t* barrier_d = params.peer_barrier_ptrs[params.local_rank]; + if (tidx < RANKS_PER_NODE) { + // The 1st block notifies the other ranks. + if (bidx == 0) { + params.peer_barrier_ptrs[tidx][params.local_rank] = params.barrier_flag; + } + + // Busy-wait until all ranks are ready. + while (barrier_d[tidx] < params.barrier_flag) {} + } + + // Make sure we can move on... + __syncthreads(); + + // The source pointers. Distributed round-robin for the different warps. + const T* src_d[RANKS_PER_NODE]; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + int rank = (params.local_rank + ii) % RANKS_PER_NODE; + src_d[ii] = params.peer_comm_buffer_ptrs[rank]; + } + + // Each block accumulates the values from the different GPUs on the same node. + for (size_t iter_offset = offset; iter_offset < max_offset; iter_offset += blockDim.x * NUM_ELTS) { + // Iterate over the different ranks/devices on the node to load the values. + PackedType vals[RANKS_PER_NODE]; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + vals[ii] = reinterpret_cast(&src_d[ii][iter_offset])[0]; + } + + // Sum the values from the different ranks. + PackedType sums = init_packed_type(); +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + sums = add128b(sums, vals[ii]); + } + + // Store to the destination buffer. + reinterpret_cast(¶ms.local_output_buffer_ptr[iter_offset])[0] = sums; + } +} + +template +static __global__ void twoShotAllReduceKernel(AllReduceParams params) +{ + + // The block index. + const int bidx = blockIdx.x; + // The thread index with the block. + const int tidx = threadIdx.x; + + // The number of elements packed into one for comms + static constexpr int NUM_ELTS = std::is_same::value ? 4 : 8; + + // Packed data type for comms + using PackedType = typename ARTypeConverter::Type; + + // The location in the destination array (load 8 fp16 or load 4 fp32 using LDG.128). + size_t offset = bidx * params.elts_per_block + tidx * NUM_ELTS + params.rank_offset; + // The end of the segment computed by that block. + size_t max_offset = min(offset + params.elts_per_block, params.elts_total); + + // Synchronize the ranks. + volatile uint32_t* barrier_d = params.peer_barrier_ptrs[params.local_rank]; + if (tidx < RANKS_PER_NODE) { + // The 1st block notifies the other ranks. + if (bidx == 0) { + params.peer_barrier_ptrs[tidx][params.local_rank] = params.barrier_flag; + } + + // Busy-wait until all ranks are ready. + while (barrier_d[tidx] < params.barrier_flag) {} + } + + // Make sure we can move on... + __syncthreads(); + + // The source pointers. Distributed round-robin for the different warps. 
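+    // (Illustrative example: with RANKS_PER_NODE == 8 and local_rank == 2, the loop below fills
+    //  src_d with the peer buffers in the order {2, 3, 4, 5, 6, 7, 0, 1}, so every rank starts
+    //  its reads from a different peer instead of all ranks hitting the same buffer first.)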
+ T* src_d[RANKS_PER_NODE]; + // The destination ranks for round-robin gathering + size_t dst_rank[RANKS_PER_NODE]; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + int rank = (params.local_rank + ii) % RANKS_PER_NODE; + src_d[ii] = params.peer_comm_buffer_ptrs[rank]; + dst_rank[ii] = rank; + } + + // Each block accumulates the values from the different GPUs on the same node. + for (size_t local_offset = offset; local_offset < max_offset; local_offset += blockDim.x * NUM_ELTS) { + + // Iterate over the different ranks/devices on the node to load the values. + PackedType vals[RANKS_PER_NODE]; +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + vals[ii] = reinterpret_cast(&src_d[ii][local_offset])[0]; + } + + // Sum the values from the different ranks. + PackedType sums = init_packed_type(); +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + sums = add128b(sums, vals[ii]); + } + + // Store to the local buffer. + reinterpret_cast(&src_d[0][local_offset])[0] = sums; + } + + // sync threads to make sure all block threads have the sums + __syncthreads(); + + // barreris among the blocks with the same idx (release-acuqire semantics) + if (tidx < RANKS_PER_NODE) { + // The all blocks notifies the other ranks. + uint32_t flag_block_offset = RANKS_PER_NODE + bidx * RANKS_PER_NODE; + st_flag_release(params.barrier_flag, params.peer_barrier_ptrs[tidx] + flag_block_offset + params.local_rank); + + // Busy-wait until all ranks are ready. + uint32_t rank_barrier = 0; + uint32_t* peer_barrier_d = params.peer_barrier_ptrs[params.local_rank] + flag_block_offset + tidx; + do { + ld_flag_acquire(rank_barrier, peer_barrier_d); + } while (rank_barrier != params.barrier_flag); + } + + // sync threads to make sure all other ranks has the final partial results + __syncthreads(); + + // Gather all needed elts from other intra-node ranks + for (size_t local_offset = offset; local_offset < max_offset; local_offset += blockDim.x * NUM_ELTS) { +#pragma unroll + for (int ii = 0; ii < RANKS_PER_NODE; ++ii) { + // use round-robin gathering from other ranks + int offset_rank = local_offset + (dst_rank[ii] - params.local_rank) * params.elts_per_rank; + reinterpret_cast(¶ms.local_output_buffer_ptr[offset_rank])[0] = + reinterpret_cast(&src_d[dst_rank[ii]][offset_rank])[0]; + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void kernelLaunchConfig( + int& blocks_per_grid, int& threads_per_block, size_t elts, int kernel_algo, size_t data_type_bytes) +{ + assert(data_type_bytes == 2 || data_type_bytes == 4); + // NOTE: need to support FP16 and FP32 + size_t elts_per_thread = 16 / data_type_bytes; + size_t elts_per_warp = (16 * WARP_SIZE) / data_type_bytes; + switch (kernel_algo) { + case 0: { // one stage all reduce algo + assert(elts % elts_per_warp == 0); + if (elts < (elts_per_thread * DEFAULT_BLOCK_SIZE)) { // local reduce + threads_per_block = ((elts + elts_per_warp - 1) / elts_per_warp) * WARP_SIZE; + blocks_per_grid = 1; + } + else { // local reduce + if (elts % (elts_per_thread * threads_per_block) == 0) { + blocks_per_grid = + (elts + elts_per_thread * threads_per_block - 1) / (elts_per_thread * threads_per_block); + // NOTE: need to adjust here + if (blocks_per_grid > MAX_ALL_REDUCE_BLOCKS) { + int iter_factor = 1; + while (blocks_per_grid / iter_factor > MAX_ALL_REDUCE_BLOCKS || blocks_per_grid % iter_factor) { + iter_factor += 1; + } + blocks_per_grid /= iter_factor; + } + } + else { + int total_threads = 
elts / elts_per_thread; + blocks_per_grid = 1; + while (total_threads % blocks_per_grid != 0 + || total_threads / blocks_per_grid > DEFAULT_BLOCK_SIZE) { + blocks_per_grid += 1; + } + threads_per_block = total_threads / blocks_per_grid; + } + } + break; + } + case 1: { // two stage all reduce algo + int total_threads = elts / RANKS_PER_NODE / RANKS_PER_NODE; + assert(elts / RANKS_PER_NODE % RANKS_PER_NODE == 0 && total_threads % WARP_SIZE == 0); + + while (total_threads % blocks_per_grid != 0 || total_threads / blocks_per_grid > DEFAULT_BLOCK_SIZE) { + blocks_per_grid += 1; + } + + threads_per_block = total_threads / blocks_per_grid; + + // NOTE: need to adjust here + if (blocks_per_grid > MAX_ALL_REDUCE_BLOCKS) { + int iter_factor = 1; + while (blocks_per_grid / iter_factor > MAX_ALL_REDUCE_BLOCKS || blocks_per_grid % iter_factor) { + iter_factor += 1; + } + blocks_per_grid /= iter_factor; + } + break; + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +void invokeOneOrTwoShotAllReduceKernel(AllReduceParams& param, cudaStream_t stream) +{ + size_t elts_total = param.elts_total; + int blocks_per_grid = 1, threads_per_block = DEFAULT_BLOCK_SIZE; + int kernel_algo = 1; + if (elts_total * sizeof(T) <= DEFALUT_ALGO_AR_SIZE_THRESHOLD) { + kernel_algo = 0; + } + + kernelLaunchConfig(blocks_per_grid, threads_per_block, elts_total, kernel_algo, sizeof(T)); + + if (kernel_algo == 0) { + param.elts_per_rank = elts_total; + param.elts_per_block = param.elts_per_rank / blocks_per_grid; + oneShotAllReduceKernel<<>>(param); + } + else { + param.elts_per_rank = param.elts_total / RANKS_PER_NODE; + param.elts_per_block = param.elts_per_rank / blocks_per_grid; + param.rank_offset = param.rank * param.elts_per_rank; + twoShotAllReduceKernel<<>>(param); + } +} + +// Template instantiation +template void invokeOneOrTwoShotAllReduceKernel(AllReduceParams& param, cudaStream_t stream); +#ifdef ENABLE_BF16 +template void invokeOneOrTwoShotAllReduceKernel<__nv_bfloat16>(AllReduceParams<__nv_bfloat16>& param, + cudaStream_t stream); +#endif +template void invokeOneOrTwoShotAllReduceKernel(AllReduceParams& param, cudaStream_t stream); +} // namespace fastertransformer \ No newline at end of file diff --git a/src/fastertransformer/kernels/custom_ar_kernels.h b/src/fastertransformer/kernels/custom_ar_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..aba07658c5105955a6e96c6cb544863f602ed033 --- /dev/null +++ b/src/fastertransformer/kernels/custom_ar_kernels.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include + +#include "src/fastertransformer/utils/cuda_utils.h" + +#define CUSTOM_AR_SIZE_THRESHOLD 50331648 +#define MAX_ALL_REDUCE_BLOCKS 24 +#define FLAG(a) ((uint32_t)((a) % 0x146)) +#define RANKS_PER_NODE 8 +#define WARP_SIZE 32 +#define DEFAULT_BLOCK_SIZE 1024 +#define DEFALUT_ALGO_AR_SIZE_THRESHOLD 393216 + +namespace fastertransformer { + +#ifdef ENABLE_BF16 +typedef struct bf168 { + __nv_bfloat162 x; + __nv_bfloat162 y; + __nv_bfloat162 z; + __nv_bfloat162 w; +} bf168; +#endif + +template +struct AllReduceParams { + size_t elts_total; + size_t elts_per_rank; + size_t elts_per_block; + size_t rank_offset; + size_t rank, local_rank, node_id; + uint32_t barrier_flag; + uint32_t* peer_barrier_ptrs[RANKS_PER_NODE]; + T* peer_comm_buffer_ptrs[RANKS_PER_NODE]; + T* local_output_buffer_ptr; +}; + +template +void invokeOneOrTwoShotAllReduceKernel(AllReduceParams& param, cudaStream_t stream); + +void kernelLaunchConfig(int& blocks_per_grid, int& threads_per_block, size_t elts, int kernel_algo); + +} // namespace fastertransformer \ No newline at end of file diff --git a/src/fastertransformer/kernels/decoder_masked_multihead_attention.cu b/src/fastertransformer/kernels/decoder_masked_multihead_attention.cu new file mode 100644 index 0000000000000000000000000000000000000000..2b5cb081d4533559e2f48c64167688787a117094 --- /dev/null +++ b/src/fastertransformer/kernels/decoder_masked_multihead_attention.cu @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h" +#include "src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh" +#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h" +#include "src/fastertransformer/utils/cuda_bf16_wrapper.h" +#include +#include +#include + +template +void multihead_attention_(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream) +{ + switch (params.hidden_size_per_head) { + case 128: + mmha_launch_kernel(params, stream); + break; + default: + assert(false); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void masked_multihead_attention(const Masked_multihead_attention_params& params, const cudaStream_t& stream) +{ + multihead_attention_>(params, stream); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void masked_multihead_attention(const Masked_multihead_attention_params& params, const cudaStream_t& stream) +{ + multihead_attention_>(params, stream); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +void masked_multihead_attention(const Masked_multihead_attention_params<__nv_bfloat16>& params, + const cudaStream_t& stream) +{ + multihead_attention_<__nv_bfloat16, Masked_multihead_attention_params<__nv_bfloat16>>(params, stream); +} +#endif diff --git a/src/fastertransformer/kernels/decoder_masked_multihead_attention.h b/src/fastertransformer/kernels/decoder_masked_multihead_attention.h new file mode 100644 index 0000000000000000000000000000000000000000..c56e87358be0240cfc4950d8a4d7332441b26cca --- /dev/null +++ b/src/fastertransformer/kernels/decoder_masked_multihead_attention.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "src/fastertransformer/layers/attention_layers_fp8/AttentionFP8Weight.h" +#include "src/fastertransformer/utils/cuda_bf16_wrapper.h" +#include "src/fastertransformer/utils/cuda_fp8_utils.h" +#include +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define CHECK_CUDA(call) \ + do { \ + cudaError_t status_ = call; \ + if (status_ != cudaSuccess) { \ + fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_)); \ + exit(1); \ + } \ + } while (0) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// The structure of parameters for the masked multihead attention kernel. +// +// We use the following terminology to describe the different dimensions. 
+// +// B: Batch size (number of sequences), +// L: Sequence length, +// D: Hidden dimension, +// H: Number of heads, +// Dh: Hidden dimension per head - Dh = D / H. + +template +struct Multihead_attention_params_base { + + // The output buffer. Dimensions B x D. + T* out = nullptr; + + // The input Qs and the associated bias. Dimensions B x D and D, resp. + const T *q = nullptr, *q_bias = nullptr; + // The input Ks and the associated bias. Dimensions B x D and D, resp. + const T *k = nullptr, *k_bias = nullptr; + // The input Vs and the associated bias. Dimensions B x D and D, resp. + const T *v = nullptr, *v_bias = nullptr; + + // The cache for the Ks. The size must be at least B x L x D. + T* k_cache = nullptr; + // The cache for the Vs. The size must be at least B x L x D. + T* v_cache = nullptr; + // The indirections to use for cache when beam sampling. + const int* cache_indir = nullptr; + + // scales + const float* query_weight_output_scale = nullptr; + const float* attention_qk_scale = nullptr; + const float* attention_output_weight_input_scale_inv = nullptr; + + // Stride to handle the case when KQV is a single buffer + int stride = 0; + + // The batch size. + int batch_size = 0; + // The beam width + int beam_width = 0; + // The sequence length. + int memory_max_len = 0; + // The number of heads (H). + int num_heads = 0; + // The hidden dimension per head (Dh). + int hidden_size_per_head = 0; + // The per-head latent space reserved for rotary embeddings. + int rotary_embedding_dim = 0; + // The maximum length of input sentences. + int max_input_length = 0; + // The current timestep. TODO(bhsueh) Check that do we only this param in cross attention? + int timestep = 0; + // The current timestep of each sentences (support different timestep for different sentences) + + // The 1.f / sqrt(Dh). Computed on the host. + float inv_sqrt_dh = 0.0f; + + // Used when we have some input context like gpt + const int* total_padding_tokens = nullptr; + + const bool* masked_tokens = nullptr; + const int* prefix_prompt_lengths = nullptr; + int max_prefix_prompt_length = 0; + + const T* relative_attention_bias = nullptr; + int relative_attention_bias_stride = 0; + // The slope per head of linear position bias to attention score (H). 
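+    // (e.g. an ALiBi-style bias, where the score for head h is shifted by
+    //  linear_bias_slopes[h] * (key_position - query_position); left as nullptr when unused.)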
+ const T* linear_bias_slopes = nullptr; + + const T* ia3_key_weights = nullptr; + const T* ia3_value_weights = nullptr; + const int* ia3_tasks = nullptr; + + const float* qkv_scale_out = nullptr; + const float* attention_out_scale = nullptr; + int int8_mode = 0; +}; + +template +struct Multihead_attention_params: public Multihead_attention_params_base { + // allows to exist attention eary + bool* finished = nullptr; + + // required in case of masked attention with different length + const int* length_per_sample = nullptr; + + T** k_cache_per_sample = nullptr; + T** v_cache_per_sample = nullptr; + size_t kv_cache_per_sample_offset = 0; + bool k_cache_interleaved = true; +}; + +template +using Masked_multihead_attention_params = Multihead_attention_params; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void masked_multihead_attention(const Masked_multihead_attention_params& params, const cudaStream_t& stream); +void masked_multihead_attention(const Masked_multihead_attention_params& params, const cudaStream_t& stream); +#ifdef ENABLE_BF16 +void masked_multihead_attention(const Masked_multihead_attention_params<__nv_bfloat16>& params, + const cudaStream_t& stream); +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_128.cu b/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_128.cu new file mode 100644 index 0000000000000000000000000000000000000000..928fadc89540b256d64ce9a8e12c96447e9c6d82 --- /dev/null +++ b/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_128.cu @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h" +#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h" +#include "src/fastertransformer/utils/cuda_bf16_wrapper.h" +#include "src/fastertransformer/utils/cuda_utils.h" +#include +#include +#include + +#include "decoder_masked_multihead_attention_template.cuh" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, HAS_BEAMS, stream) \ + size_t smem_sz = mmha::smem_size_in_bytes(params, THDS_PER_VALUE, THDS_PER_BLOCK); \ + dim3 grid(params.num_heads, params.batch_size); \ + mmha::masked_multihead_attention_kernel \ + <<>>(params) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// !!! 
Specialize the launcher for Cross attention +template +void mmha_launch_kernel(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream) +{ + constexpr int THREADS_PER_VALUE = threads_per_value_t::value; + // constexpr bool DO_CROSS_ATTENTION = std::is_same>::value; + int tlength = params.timestep; + + FT_CHECK(params.cache_indir == nullptr); + + if (tlength < 32) { + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 64, false, stream); + } + else if (tlength < 2048) { + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 2, THREADS_PER_VALUE, 128, false, stream); + } + else { + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 1, THREADS_PER_VALUE, 256, false, stream); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template void mmha_launch_kernel>( + const Masked_multihead_attention_params& params, const cudaStream_t& stream); +template void mmha_launch_kernel>( + const Masked_multihead_attention_params& params, const cudaStream_t& stream); +#ifdef ENABLE_BF16 +template void mmha_launch_kernel<__nv_bfloat16, 128, 128, Masked_multihead_attention_params<__nv_bfloat16>>( + const Masked_multihead_attention_params<__nv_bfloat16>& params, const cudaStream_t& stream); +#endif +#ifdef ENABLE_FP8 +template void mmha_launch_kernel<__nv_fp8_e4m3, 128, 128, Masked_multihead_attention_params<__nv_fp8_e4m3>>( + const Masked_multihead_attention_params<__nv_fp8_e4m3>& params, const cudaStream_t& stream); +#endif + +#undef MMHA_LAUNCH_KERNEL diff --git a/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh b/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ddbbe446e227f7249638f0cc931fe7e4f2f9b9e2 --- /dev/null +++ b/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.cuh @@ -0,0 +1,1820 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h" +#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h" +#include "src/fastertransformer/utils/cuda_bf16_wrapper.h" +#include "src/fastertransformer/utils/cuda_fp8_utils.h" +#include "src/fastertransformer/utils/cuda_type_utils.cuh" +#include +#include +#include + +// #define MMHA_USE_HMMA_FOR_REDUCTION + +// Below are knobs to extend FP32 accumulation for higher FP16 accuracy + +// Does not seem to affect the accuracy that much +// #define MMHA_USE_FP32_ACUM_FOR_FMA + +// Seems to slightly improve the accuracy +#define MMHA_USE_FP32_ACUM_FOR_OUT + +#if 0 && defined(MMHA_USE_FP32_ACUM_FOR_OUT) + // Does not seem to improve the accuracy + //#define MMHA_USE_FP32_ACUM_FOR_LOGITS +#endif + +namespace mmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// We use the following terminology to describe the different dimensions. +// +// B: Batch size (number of sequences), +// L: Sequence length, +// D: Hidden dimension, +// H: Number of heads, +// Dh: Hidden dimension per head - Dh = D / H. +// +// The different kernels assign a threadblock for B x H pair. The grid has size (1, B, H). We use +// 64, 128 and 256 threads per block. +// +// Each threadblock loads Dh values from Q and its associated bias. The kernels run a loop to +// compute Q * K^T where K is loaded from a cache buffer -- except for the current timestep. The +// cache buffer helps with memory accesses and contains keys with bias. +// +// The layout of the cache buffer for the keys is [B, H, Dh/x, L, x] where x == 8 for FP16 and +// x == 4 for FP32 where the fastest moving dimension (contiguous data) is the rightmost one. The +// values for x are chosen to create chunks of 16 bytes. +// +// The different kernels use 1, 2 or 4 threads per key (THREADS_PER_KEY). The size of the LDGs +// depends on the number of threads per key. Each thread sums Dh / THREADS_PER_KEY elements. At +// the end of each iteration of the Q * K^T loop, we perform a reduction between lanes using an +// HMMA instruction (Tensor Core). Each Q * K^T valuey is stored in shared memory in FP32. +// +// After that loop, a parallel softmax is computed across the different Q * K^T values stored in +// shared memory. +// +// The kernel ends with a loop over the values in V. We use THREADS_PER_VALUE to control how many +// timesteps are computed by loop iteration. As with the keys, the values are read from a cache +// except for the current timestep. The layout of the cache buffer for the values is much simpler +// as it is [B, H, L, Dh]. 
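+//
+// Illustrative index math (assuming FP16, so x == 8): the key element for batch b, head h,
+// channel d and timestep l lives in the K cache at linear offset
+//
+//     (((b*H + h) * (Dh/8) + d/8) * L + l) * 8 + (d%8),
+//
+// so the 16-byte chunk holding channels [8*(d/8), 8*(d/8)+7] at timestep l sits right next to the
+// chunk for timestep l+1, which is what lets a thread stream along L with 16-byte loads in the
+// Q*K^T loop.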
+// + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Qk_vec_m_ {}; + +template<> +struct Qk_vec_m_ { + using Type = float; +}; +template<> +struct Qk_vec_m_ { + using Type = float2; +}; +template<> +struct Qk_vec_m_ { + using Type = float4; +}; +template<> +struct Qk_vec_m_ { + using Type = float4; +}; +template<> +struct Qk_vec_m_ { + using Type = uint32_t; +}; +template<> +struct Qk_vec_m_ { + using Type = uint32_t; +}; +template<> +struct Qk_vec_m_ { + using Type = uint2; +}; +template<> +struct Qk_vec_m_ { + using Type = uint4; +}; +#ifdef ENABLE_BF16 +template<> +struct Qk_vec_m_<__nv_bfloat16, 32> { + using Type = __nv_bfloat162; +}; +template<> +struct Qk_vec_m_<__nv_bfloat16, 64> { + using Type = __nv_bfloat162; +}; +template<> +struct Qk_vec_m_<__nv_bfloat16, 128> { + using Type = bf16_4_t; +}; +template<> +struct Qk_vec_m_<__nv_bfloat16, 256> { + using Type = bf16_8_t; +}; +#endif // ENABLE_BF16 + +#ifdef ENABLE_FP8 +template<> +struct Qk_vec_m_<__nv_fp8_e4m3, 32> { + using Type = fp8_4_t; +}; +template<> +struct Qk_vec_m_<__nv_fp8_e4m3, 64> { + using Type = fp8_4_t; +}; +template<> +struct Qk_vec_m_<__nv_fp8_e4m3, 128> { + using Type = fp8_4_t; +}; +template<> +struct Qk_vec_m_<__nv_fp8_e4m3, 256> { + using Type = fp8_4_t; +}; +#endif // ENABLE_FP8 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Qk_vec_k_ { + using Type = typename Qk_vec_m_::Type; +}; +#ifdef ENABLE_FP8 +template<> +struct Qk_vec_k_<__nv_fp8_e4m3, 32> { + using Type = float4; +}; +template<> +struct Qk_vec_k_<__nv_fp8_e4m3, 64> { + using Type = float4; +}; +template<> +struct Qk_vec_k_<__nv_fp8_e4m3, 128> { + using Type = float4; +}; +template<> +struct Qk_vec_k_<__nv_fp8_e4m3, 256> { + using Type = float4; +}; +#endif // ENABLE_FP8 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct K_vec_m_ {}; + +template<> +struct K_vec_m_ { + using Type = float; +}; +template<> +struct K_vec_m_ { + using Type = float2; +}; +template<> +struct K_vec_m_ { + using Type = float4; +}; +template<> +struct K_vec_m_ { + using Type = uint32_t; +}; +template<> +struct K_vec_m_ { + using Type = uint2; +}; +template<> +struct K_vec_m_ { + using Type = uint4; +}; +#ifdef ENABLE_BF16 +template<> +struct K_vec_m_<__nv_bfloat16, 4> { + using Type = __nv_bfloat162; +}; +template<> +struct K_vec_m_<__nv_bfloat16, 2> { + using Type = bf16_4_t; +}; +template<> +struct K_vec_m_<__nv_bfloat16, 1> { + using Type = bf16_8_t; +}; +#endif // ENABLE_BF16 + +// NOTE: THREADS_PER_KEY * sizeof(K_vec_m_) = 128 bytes +#ifdef ENABLE_FP8 +template<> +struct K_vec_m_<__nv_fp8_e4m3, 4> { + using Type = fp8_4_t; +}; +template<> +struct K_vec_m_<__nv_fp8_e4m3, 2> { + using Type = fp8_4_t; +}; // Defined for compilation-purpose only, do not use +template<> +struct K_vec_m_<__nv_fp8_e4m3, 1> { + using Type = fp8_4_t; +}; // Defined for compilation-purpose only, do not use +#endif // ENABLE_FP8 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct K_vec_k_ { + using Type = typename K_vec_m_::Type; +}; +#ifdef ENABLE_FP8 +template<> +struct K_vec_k_<__nv_fp8_e4m3, 4> { + using Type = float4; +}; +template<> +struct K_vec_k_<__nv_fp8_e4m3, 2> { + using Type = float4; +}; // Defined for compilation-purpose only, do not use +template<> +struct K_vec_k_<__nv_fp8_e4m3, 1> { + using 
Type = float4; +}; // Defined for compilation-purpose only, do not use +#endif // ENABLE_FP8 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct V_vec_m_ {}; + +template<> +struct V_vec_m_ { + using Type = float; +}; +template<> +struct V_vec_m_ { + using Type = float2; +}; +template<> +struct V_vec_m_ { + using Type = float4; +}; +template<> +struct V_vec_m_ { + using Type = uint32_t; +}; +template<> +struct V_vec_m_ { + using Type = uint2; +}; +template<> +struct V_vec_m_ { + using Type = uint4; +}; +#ifdef ENABLE_BF16 +template<> +struct V_vec_m_<__nv_bfloat16, 2> { + using Type = __nv_bfloat162; +}; +template<> +struct V_vec_m_<__nv_bfloat16, 4> { + using Type = bf16_4_t; +}; +template<> +struct V_vec_m_<__nv_bfloat16, 8> { + using Type = bf16_8_t; +}; +#endif // ENABLE_BF16 +#ifdef ENABLE_FP8 +template<> +struct V_vec_m_<__nv_fp8_e4m3, 4> { + using Type = fp8_4_t; +}; +template<> +struct V_vec_m_<__nv_fp8_e4m3, 8> { + using Type = fp8_4_t; +}; +template<> +struct V_vec_m_<__nv_fp8_e4m3, 16> { + using Type = fp8_4_t; +}; +#endif // ENABLE_FP8 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct V_vec_k_ { + using Type = typename V_vec_m_::Type; +}; +#ifdef ENABLE_FP8 +template<> +struct V_vec_k_<__nv_fp8_e4m3, 4> { + using Type = float4; +}; +template<> +struct V_vec_k_<__nv_fp8_e4m3, 8> { + using Type = float4; +}; +template<> +struct V_vec_k_<__nv_fp8_e4m3, 16> { + using Type = float4; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA +template +struct Qk_vec_acum_fp32_ {}; + +template<> +struct Qk_vec_acum_fp32_ { + using Type = float; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = float4; +}; +// template<> struct Qk_vec_acum_fp32_ { using Type = float; }; +template<> +struct Qk_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float8_; +}; +template<> +struct Qk_vec_acum_fp32_<__nv_bfloat16> { + using Type = float; +}; +template<> +struct Qk_vec_acum_fp32_<__nv_bfloat162> { + using Type = float2; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float8_; +}; + +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float8_; +}; +template<> +struct Qk_vec_acum_fp32_<__nv_bfloat16> { + using Type = float; +}; +template<> +struct Qk_vec_acum_fp32_<__nv_bfloat162> { + using Type = float2; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float8_; +}; +#ifdef ENABLE_FP8 +// template<> +// struct Qk_vec_acum_fp32_ { +// using Type = float2; +// }; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float4_; +}; +// template<> +// struct Qk_vec_acum_fp32_ { +// using Type = Float4_; +// }; +#endif // ENABLE_FP8 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct K_vec_acum_fp32_ {}; + +template<> +struct K_vec_acum_fp32_ { + using Type = float; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = float4; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = 
float2; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = Float8_; +}; +template<> +struct K_vec_acum_fp32_<__nv_bfloat16> { + using Type = float; +}; +template<> +struct K_vec_acum_fp32_<__nv_bfloat162> { + using Type = float2; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = Float8_; +}; +#ifdef ENABLE_FP8 +// template<> +// struct K_vec_acum_fp32_ { +// using Type = float2; +// }; +template<> +struct K_vec_acum_fp32_ { + using Type = Float4_; +}; +// template<> +// struct K_vec_acum_fp32_ { +// using Type = Float4_; +// }; +#endif // ENABLE_FP8 +#endif // MMHA_USE_FP32_ACUM_FOR_FMA + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef MMHA_USE_FP32_ACUM_FOR_OUT +template +struct V_vec_acum_fp32_ {}; + +template<> +struct V_vec_acum_fp32_ { + using Type = float; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = float4; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = Float8_; +}; +#ifdef ENABLE_BF16 +template<> +struct V_vec_acum_fp32_<__nv_bfloat162> { + using Type = float2; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = Float8_; +}; +#endif // ENABLE_BF16 +#ifdef ENABLE_FP8 +// template<> +// struct V_vec_acum_fp32_ { +// using Type = float2; +// }; +template<> +struct V_vec_acum_fp32_ { + using Type = Float4_; +}; +// template<> +// struct V_vec_acum_fp32_ { +// using Type = Float4_; +// }; +#endif // ENABLE_FP8 +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__inline__ __device__ Tout vec_conversion(const Tin& x) +{ + return x; +} +#ifdef ENABLE_FP8 +// fp8_t +template<> +__inline__ __device__ float vec_conversion(const __nv_fp8_e4m3& a) +{ + return float(a); +} +template<> +__inline__ __device__ __nv_fp8_e4m3 vec_conversion<__nv_fp8_e4m3, float>(const float& a) +{ + return __nv_fp8_e4m3(a); +} +// fp8_2_t +template<> +__inline__ __device__ float2 vec_conversion(const fp8_2_t& a) +{ + return float2(a); +} +template<> +__inline__ __device__ fp8_2_t vec_conversion(const float2& a) +{ + return fp8_2_t(a); +} +// fp8_4_t +template<> +__inline__ __device__ float4 vec_conversion(const fp8_4_t& a) +{ + return float4(a); +} +template<> +__inline__ __device__ fp8_4_t vec_conversion(const float4& a) +{ + return fp8_4_t(a); +} +#endif // ENABLE_FP8 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float qk_dot_(const K_vec (&q)[N], const K_vec (&k)[N]) +{ +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA + using K_vec_acum = typename K_vec_acum_fp32_::Type; +#else + using K_vec_acum = K_vec; +#endif + // Compute the parallel products for Q*K^T (treat vector lanes separately). + K_vec_acum qk_vec = mul(q[0], k[0]); +#pragma unroll + for (int ii = 1; ii < N; ++ii) { + qk_vec = fma(q[ii], k[ii], qk_vec); + } + + // Finalize the reduction across lanes. 
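+    // (Concrete example, assuming THREADS_PER_KEY == 4: the XOR-shuffle loop below runs with
+    //  mask = 2 and then mask = 1, so each aligned group of 4 lanes exchanges its partial sums
+    //  and all 4 lanes end up with the complete Q*K^T dot product for their key.)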
+ float qk = sum(qk_vec); +#pragma unroll + for (int mask = THREADS_PER_KEY / 2; mask >= 1; mask /= 2) { + qk += __shfl_xor_sync(uint32_t(-1), qk, mask); + } + return qk; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Qk_dot { + template + static inline __device__ float dot(const K_vec (&q)[N], const K_vec (&k)[N]) + { + return qk_dot_(q, k); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 hmma_fp32(const uint2& a, uint32_t b) +{ + float4 c; + float zero = 0.f; + asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5}, \n" + " {%6}, \n" + " {%7, %7, %7, %7}; \n" + + : "=f"(c.x), "=f"(c.y), "=f"(c.z), "=f"(c.w) + : "r"(a.x) "r"(a.y), "r"(b), "f"(zero)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float qk_hmma_dot_(const uint32_t (&q)[N], const uint32_t (&k)[N]) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750 +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA + using K_vec_acum = typename K_vec_acum_fp32_::Type; +#else + using K_vec_acum = uint32_t; +#endif + K_vec_acum qk_vec = mul(q[0], k[0]); +#pragma unroll + for (int ii = 1; ii < N; ++ii) { + qk_vec = fma(q[ii], k[ii], qk_vec); + } +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA + uint32_t qk_vec_ = float2_to_half2(qk_vec); + return hmma_fp32(make_uint2(qk_vec_, 0u), 0x3c003c00u).x; +#else + return hmma_fp32(make_uint2(qk_vec, 0u), 0x3c003c00u).x; +#endif +#else + return 0.f; +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +struct Qk_dot { + template + static inline __device__ float dot(const uint32_t (&q)[N], const uint32_t (&k)[N]) + { +#if __CUDA_ARCH__ >= 750 && defined(MMHA_USE_HMMA_FOR_REDUCTION) + return qk_hmma_dot_(q, k); +#else + return qk_dot_<4>(q, k); +#endif // defined MMHA_USE_HMMA_FOR_REDUCTION + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float block_sum(float* red_smem, float sum) +{ + + // Decompose the thread index into warp / lane. + int warp = threadIdx.x / WARP_SIZE; + int lane = threadIdx.x % WARP_SIZE; + +// Compute the sum per warp. +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + } + + // Warp leaders store the data to shared memory. + if (lane == 0) { + red_smem[warp] = sum; + } + + // Make sure the data is in shared memory. + __syncthreads(); + + // The warps compute the final sums. + if (lane < WARPS_PER_BLOCK) { + sum = red_smem[lane]; + } + +// Parallel reduction inside the warp. +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + } + + // Broadcast to other threads. 
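+    // (Every warp re-reduced the per-warp partial sums held in red_smem, so lane 0 of each warp
+    //  already holds the block-wide total; the shuffle below broadcasts lane 0's value so that
+    //  all threads return the same result.)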
+ return __shfl_sync(uint32_t(-1), sum, 0); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(float& dst, float src) +{ + dst = src; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint16_t& dst, float src) +{ + dst = float_to_half(src); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint32_t& dst, float2 src) +{ + dst = float2_to_half2(src); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef ENABLE_BF16 +inline __device__ void convert_from_float(__nv_bfloat16& dst, float src) +{ + dst = __float2bfloat16(src); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(__nv_bfloat162& dst, float2 src) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + dst = __float22bfloat162_rn(src); +#else + dst = __floats2bfloat162_rn(src.x, src.y); +#endif +} +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint2& dst, Float4_ src) +{ + dst.x = float2_to_half2(src.x); + dst.y = float2_to_half2(src.y); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint2& dst, float4 src) +{ + convert_from_float(dst, Float4_{make_float2(src.x, src.y), make_float2(src.z, src.w)}); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint4& dst, Float8_ src) +{ + dst.x = float2_to_half2(src.x); + dst.y = float2_to_half2(src.y); + dst.z = float2_to_half2(src.z); + dst.w = float2_to_half2(src.w); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +inline __device__ void convert_from_float(bf16_4_t& dst, Float4_ src) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + dst.x = __float22bfloat162_rn(src.x); + dst.y = __float22bfloat162_rn(src.y); +#else + dst.x = __floats2bfloat162_rn(src.x.x, src.x.y); + dst.y = __floats2bfloat162_rn(src.y.x, src.y.y); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(bf16_4_t& dst, float4 src) +{ + convert_from_float(dst, Float4_{make_float2(src.x, src.y), make_float2(src.z, src.w)}); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(bf16_8_t& dst, Float8_ src) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + dst.x = __float22bfloat162_rn(src.x); + dst.y = __float22bfloat162_rn(src.y); + dst.z = __float22bfloat162_rn(src.z); + dst.w = __float22bfloat162_rn(src.w); +#else + dst.x = __floats2bfloat162_rn(src.x.x, src.x.y); + dst.y = __floats2bfloat162_rn(src.y.x, src.y.y); + dst.z = __floats2bfloat162_rn(src.z.x, src.z.y); + dst.w = __floats2bfloat162_rn(src.w.x, src.w.y); +#endif +} +#endif // ENABLE_BF16 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
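+// Illustrative sketch only (not used by the kernels in this file): with MMHA_USE_FP32_ACUM_FOR_OUT
+// the V-loop accumulates in FP32 (Float8_) and narrows to the storage type only on the final
+// store, via the convert_from_float overloads above. The helper name below is hypothetical and
+// exists purely as an example of that pattern.
+inline __device__ uint4 example_store_fp32_accum_as_half8(Float8_ acc)
+{
+    uint4 out;                     // 8 fp16 values packed into four 32-bit registers
+    convert_from_float(out, acc);  // Float8_ -> uint4 overload defined above
+    return out;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+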
+#ifdef ENABLE_FP8 +inline __device__ void convert_from_float(fp8_4_t& dst, float4 src) +{ + dst = fp8_4_t(src); +} +inline __device__ void convert_from_float(fp8_2_t& dst, float2 src) +{ + dst = fp8_2_t(src); +} +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(float2& dst, float2 src) +{ + dst = src; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(float4& dst, float4 src) +{ + dst = src; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float convert_to_float(float4 u) +{ + return u.x; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float convert_to_float(uint4 u) +{ + float2 tmp = half2_to_float2(u.x); + return tmp.x; +} + +#if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float cast_to_float(float u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 cast_to_float(float2 u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 cast_to_float(float4 u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ cast_to_float(Float4_ u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ cast_to_float(Float8_ u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 cast_to_float(uint32_t u) +{ + return half2_to_float2(u); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ cast_to_float(uint2 u) +{ + Float4_ tmp; + tmp.x = half2_to_float2(u.x); + tmp.y = half2_to_float2(u.y); + return tmp; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ cast_to_float(uint4 u) +{ + Float8_ tmp; + tmp.x = half2_to_float2(u.x); + tmp.y = half2_to_float2(u.y); + tmp.z = half2_to_float2(u.z); + tmp.w = half2_to_float2(u.w); + return tmp; +} + +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float float_from_int8(int8_t u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 float_from_int8(int16_t u) +{ + union { + int16_t int16; + int8_t int8[2]; + }; + int16 = u; + return make_float2(int8[0], int8[1]); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 float_from_int8(int32_t u) +{ + union { + int32_t int32; + int8_t int8[4]; + }; + int32 = u; + return make_float4(int8[0], int8[1], int8[2], int8[3]); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// clang-format off 
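+// (The int64_t overload below splits the 8 packed int8 values into four int16 lanes and expands
+//  each lane's two bytes with the int16_t overload above, producing a Float8_ of 8 floats.)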
+inline __device__ Float8_ float_from_int8(int64_t u) +{ + union { + int64_t int64; + int16_t int16[4]; + }; + int64 = u; + return Float8_ {float_from_int8(int16[0]), + float_from_int8(int16[1]), + float_from_int8(int16[2]), + float_from_int8(int16[3])}; +} +// clang-format on + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ int8_t cast_to_int8(float val) +{ + union { + int8_t int8[2]; + int16_t int16; + }; + asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val)); + return int8[0]; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ int32_t cast_to_int8(float4 val) +{ + union { + int8_t int8[4]; + int32_t int32; + }; + int8[0] = cast_to_int8(val.x); + int8[1] = cast_to_int8(val.y); + int8[2] = cast_to_int8(val.z); + int8[3] = cast_to_int8(val.w); + return int32; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ int64_t cast_to_int8(Float8_ val) +{ + union { + int8_t int8[8]; + int64_t int64; + }; + int8[0] = cast_to_int8(val.x.x); + int8[1] = cast_to_int8(val.x.y); + int8[2] = cast_to_int8(val.y.x); + int8[3] = cast_to_int8(val.y.y); + int8[4] = cast_to_int8(val.z.x); + int8[5] = cast_to_int8(val.z.y); + int8[6] = cast_to_int8(val.w.x); + int8[7] = cast_to_int8(val.w.y); + return int64; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ __host__ T div_up(T m, T n) +{ + return (m + n - 1) / n; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct kernel_type_t { + using Type = T; +}; + +#ifdef ENABLE_FP8 +template<> +struct kernel_type_t<__nv_fp8_e4m3> { + using Type = float; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline size_t +smem_size_in_bytes(const Multihead_attention_params& params, int threads_per_value, int threads_per_block) +{ + using Tk = typename kernel_type_t::Type; + // The amount of shared memory needed to store the Q*K^T values in float. + const int max_timesteps = min(params.timestep, params.memory_max_len); + size_t qk_sz = div_up(max_timesteps + 1, 4) * 16; + + // The extra memory needed if we are not using floats for the final logits. + size_t logits_sz = 0; +#ifndef MMHA_USE_FP32_ACUM_FOR_LOGITS + if (sizeof(Tk) != 4) { + // TDOD + logits_sz = div_up(max_timesteps + 1, 4) * 4 * sizeof(Tk); + } +#endif + + // The total size needed during softmax. + size_t softmax_sz = qk_sz + logits_sz; + + // The number of partial rows to reduce in the final reduction. + int rows_per_red = threads_per_block / threads_per_value; + // The amount of storage needed to finalize the outputs. + size_t red_sz = rows_per_red * params.hidden_size_per_head * sizeof(Tk) / 2; + + // The max. + return max(softmax_sz, red_sz); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ constexpr uint32_t shfl_mask(int threads) +{ + return threads == 32 ? uint32_t(-1) : (1u << threads) - 1u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template