Unverified commit 9efcac38, authored by Li Zhang and committed by GitHub

check-in fastertransformer (#7)

* add ft code

* gitignore

* fix lint

* revert fmha
parent 720fc533
@@ -3,3 +3,5 @@
__pycache__/
*.egg-info/
workspace/
.cache
*build*/
// Read an INI file into easy-to-access name/value pairs.
// inih and INIReader are released under the New BSD license.
// Go to the project home page for more info:
//
// https://github.com/benhoyt/inih (Initial repo)
// https://github.com/jtilly/inih (The reference of this header file)
/* inih -- simple .INI file parser
inih is released under the New BSD license (see LICENSE.txt). Go to the project
home page for more info:
https://github.com/benhoyt/inih
https://github.com/jtilly/inih
*/
#ifndef __INI_H__
#define __INI_H__
/* Make this header file easier to include in C++ code */
#ifdef __cplusplus
extern "C" {
#endif
#include <stdio.h>
/* Typedef for prototype of handler function. */
typedef int (*ini_handler)(void* user, const char* section,
const char* name, const char* value);
/* Typedef for prototype of fgets-style reader function. */
typedef char* (*ini_reader)(char* str, int num, void* stream);
/* Parse given INI-style file. May have [section]s, name=value pairs
(whitespace stripped), and comments starting with ';' (semicolon). Section
is "" if name=value pair parsed before any section heading. name:value
pairs are also supported as a concession to Python's configparser.
For each name=value pair parsed, call handler function with given user
pointer as well as section, name, and value (data only valid for duration
of handler call). Handler should return nonzero on success, zero on error.
Returns 0 on success, line number of first error on parse error (doesn't
stop on first error), -1 on file open error, or -2 on memory allocation
error (only when INI_USE_STACK is zero).
*/
int ini_parse(const char* filename, ini_handler handler, void* user);
/* Same as ini_parse(), but takes a FILE* instead of filename. This doesn't
close the file when it's finished -- the caller must do that. */
int ini_parse_file(FILE* file, ini_handler handler, void* user);
/* Same as ini_parse(), but takes an ini_reader function pointer instead of
filename. Used for implementing custom or string-based I/O. */
int ini_parse_stream(ini_reader reader, void* stream, ini_handler handler,
void* user);
/* Nonzero to allow multi-line value parsing, in the style of Python's
configparser. If allowed, ini_parse() will call the handler with the same
name for each subsequent line parsed. */
#ifndef INI_ALLOW_MULTILINE
#define INI_ALLOW_MULTILINE 1
#endif
/* Nonzero to allow a UTF-8 BOM sequence (0xEF 0xBB 0xBF) at the start of
the file. See http://code.google.com/p/inih/issues/detail?id=21 */
#ifndef INI_ALLOW_BOM
#define INI_ALLOW_BOM 1
#endif
/* Nonzero to allow inline comments (with valid inline comment characters
specified by INI_INLINE_COMMENT_PREFIXES). Set to 0 to turn off and match
Python 3.2+ configparser behaviour. */
#ifndef INI_ALLOW_INLINE_COMMENTS
#define INI_ALLOW_INLINE_COMMENTS 1
#endif
#ifndef INI_INLINE_COMMENT_PREFIXES
#define INI_INLINE_COMMENT_PREFIXES ";"
#endif
/* Nonzero to use stack, zero to use heap (malloc/free). */
#ifndef INI_USE_STACK
#define INI_USE_STACK 1
#endif
/* Stop parsing on first error (default is to keep parsing). */
#ifndef INI_STOP_ON_FIRST_ERROR
#define INI_STOP_ON_FIRST_ERROR 0
#endif
/* Maximum line length for any line in INI file. */
#ifndef INI_MAX_LINE
#define INI_MAX_LINE 200
#endif
#ifdef __cplusplus
}
#endif
/* inih -- simple .INI file parser
inih is released under the New BSD license (see LICENSE.txt). Go to the project
home page for more info:
https://github.com/benhoyt/inih
*/
#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS)
#define _CRT_SECURE_NO_WARNINGS
#endif
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#if !INI_USE_STACK
#include <stdlib.h>
#endif
#define MAX_SECTION 50
#define MAX_NAME 50
/* Strip whitespace chars off end of given string, in place. Return s. */
inline static char* rstrip(char* s)
{
char* p = s + strlen(s);
while (p > s && isspace((unsigned char)(*--p)))
*p = '\0';
return s;
}
/* Return pointer to first non-whitespace char in given string. */
inline static char* lskip(const char* s)
{
while (*s && isspace((unsigned char)(*s)))
s++;
return (char*)s;
}
/* Return pointer to first char (of chars) or inline comment in given string,
or pointer to null at end of string if neither found. Inline comment must
be prefixed by a whitespace character to register as a comment. */
inline static char* find_chars_or_comment(const char* s, const char* chars)
{
#if INI_ALLOW_INLINE_COMMENTS
int was_space = 0;
while (*s && (!chars || !strchr(chars, *s)) &&
!(was_space && strchr(INI_INLINE_COMMENT_PREFIXES, *s))) {
was_space = isspace((unsigned char)(*s));
s++;
}
#else
while (*s && (!chars || !strchr(chars, *s))) {
s++;
}
#endif
return (char*)s;
}
/* Version of strncpy that ensures dest (size bytes) is null-terminated. */
inline static char* strncpy0(char* dest, const char* src, size_t size)
{
strncpy(dest, src, size);
dest[size - 1] = '\0';
return dest;
}
/* See documentation in header file. */
inline int ini_parse_stream(ini_reader reader, void* stream, ini_handler handler,
void* user)
{
/* Uses a fair bit of stack (use heap instead if you need to) */
#if INI_USE_STACK
char line[INI_MAX_LINE];
#else
char* line;
#endif
char section[MAX_SECTION] = "";
char prev_name[MAX_NAME] = "";
char* start;
char* end;
char* name;
char* value;
int lineno = 0;
int error = 0;
#if !INI_USE_STACK
line = (char*)malloc(INI_MAX_LINE);
if (!line) {
return -2;
}
#endif
/* Scan through stream line by line */
while (reader(line, INI_MAX_LINE, stream) != NULL) {
lineno++;
start = line;
#if INI_ALLOW_BOM
if (lineno == 1 && (unsigned char)start[0] == 0xEF &&
(unsigned char)start[1] == 0xBB &&
(unsigned char)start[2] == 0xBF) {
start += 3;
}
#endif
start = lskip(rstrip(start));
if (*start == ';' || *start == '#') {
/* Per Python configparser, allow both ; and # comments at the
start of a line */
}
#if INI_ALLOW_MULTILINE
else if (*prev_name && *start && start > line) {
#if INI_ALLOW_INLINE_COMMENTS
end = find_chars_or_comment(start, NULL);
if (*end)
*end = '\0';
rstrip(start);
#endif
/* Non-blank line with leading whitespace, treat as continuation
of previous name's value (as per Python configparser). */
if (!handler(user, section, prev_name, start) && !error)
error = lineno;
}
#endif
else if (*start == '[') {
/* A "[section]" line */
end = find_chars_or_comment(start + 1, "]");
if (*end == ']') {
*end = '\0';
strncpy0(section, start + 1, sizeof(section));
*prev_name = '\0';
}
else if (!error) {
/* No ']' found on section line */
error = lineno;
}
}
else if (*start) {
/* Not a comment, must be a name[=:]value pair */
end = find_chars_or_comment(start, "=:");
if (*end == '=' || *end == ':') {
*end = '\0';
name = rstrip(start);
value = lskip(end + 1);
#if INI_ALLOW_INLINE_COMMENTS
end = find_chars_or_comment(value, NULL);
if (*end)
*end = '\0';
#endif
rstrip(value);
/* Valid name[=:]value pair found, call handler */
strncpy0(prev_name, name, sizeof(prev_name));
if (!handler(user, section, name, value) && !error)
error = lineno;
}
else if (!error) {
/* No '=' or ':' found on name[=:]value line */
error = lineno;
}
}
#if INI_STOP_ON_FIRST_ERROR
if (error)
break;
#endif
}
#if !INI_USE_STACK
free(line);
#endif
return error;
}
/* See documentation in header file. */
inline int ini_parse_file(FILE* file, ini_handler handler, void* user)
{
return ini_parse_stream((ini_reader)fgets, file, handler, user);
}
/* See documentation in header file. */
inline int ini_parse(const char* filename, ini_handler handler, void* user)
{
FILE* file;
int error;
file = fopen(filename, "r");
if (!file)
return -1;
error = ini_parse_file(file, handler, user);
fclose(file);
return error;
}
#endif /* __INI_H__ */
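/* Usage sketch (illustrative, not part of inih): a minimal handler that prints
   every name=value pair. Compiled only when INIH_USAGE_EXAMPLE is defined, so
   the header stays header-only; "config.ini" is a placeholder path and the
   error codes follow the ini_parse() documentation above. */
#ifdef INIH_USAGE_EXAMPLE
static int dump_handler(void* user, const char* section, const char* name,
                        const char* value)
{
    (void)user; /* user pointer unused in this sketch */
    printf("[%s] %s = %s\n", section, name, value);
    return 1; /* nonzero tells ini_parse() the pair was handled successfully */
}
int main(void)
{
    int err = ini_parse("config.ini", dump_handler, NULL);
    if (err == -1)
        printf("could not open config.ini\n");
    else if (err > 0)
        printf("parse error on line %d\n", err);
    return err;
}
#endif /* INIH_USAGE_EXAMPLE */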
#ifndef __INIREADER_H__
#define __INIREADER_H__
#include <map>
#include <set>
#include <string>
// Read an INI file into easy-to-access name/value pairs. (Note that I've gone
// for simplicity here rather than speed, but it should be pretty decent.)
class INIReader
{
public:
    // Default constructor: creates an empty reader with no parsed values
    INIReader() {}
// Construct INIReader and parse given filename. See ini.h for more info
// about the parsing.
INIReader(std::string filename);
// Construct INIReader and parse given file. See ini.h for more info
// about the parsing.
INIReader(FILE *file);
~INIReader();
// Return the result of ini_parse(), i.e., 0 on success, line number of
// first error on parse error, or -1 on file open error.
int ParseError() const;
// Return the list of sections found in ini file
const std::set<std::string>& Sections() const;
// Get a string value from INI file, returning default_value if not found.
std::string Get(std::string section, std::string name,
std::string default_value) const;
std::string Get(std::string section, std::string name) const;
// Get an integer (long) value from INI file, returning default_value if
// not found or not a valid integer (decimal "1234", "-1234", or hex "0x4d2").
long GetInteger(std::string section, std::string name, long default_value) const;
long GetInteger(std::string section, std::string name) const;
// Get a real (floating point double) value from INI file, returning
// default_value if not found or not a valid floating point value
// according to strtod().
double GetReal(std::string section, std::string name, double default_value) const;
// Get a single precision floating point number value from INI file, returning
// default_value if not found or not a valid floating point value
// according to strtof().
float GetFloat(std::string section, std::string name, float default_value) const;
float GetFloat(std::string section, std::string name) const;
// Get a boolean value from INI file, returning default_value if not found or if
// not a valid true/false value. Valid true values are "true", "yes", "on", "1",
// and valid false values are "false", "no", "off", "0" (not case sensitive).
bool GetBoolean(std::string section, std::string name, bool default_value) const;
protected:
int _error;
std::map<std::string, std::string> _values;
std::set<std::string> _sections;
static std::string MakeKey(std::string section, std::string name);
static int ValueHandler(void* user, const char* section, const char* name,
const char* value);
};
#endif // __INIREADER_H__
#ifndef __INIREADER__
#define __INIREADER__
#include <algorithm>
#include <cctype>
#include <cstdlib>
inline INIReader::INIReader(std::string filename)
{
_error = ini_parse(filename.c_str(), ValueHandler, this);
}
inline INIReader::INIReader(FILE *file)
{
_error = ini_parse_file(file, ValueHandler, this);
}
inline int INIReader::ParseError() const
{
return _error;
}
inline INIReader::~INIReader() { }
inline const std::set<std::string>& INIReader::Sections() const
{
return _sections;
}
inline std::string INIReader::Get(std::string section, std::string name, std::string default_value) const
{
std::string key = MakeKey(section, name);
return _values.count(key) ? _values.at(key) : default_value;
}
inline std::string INIReader::Get(std::string section, std::string name) const
{
std::string key = MakeKey(section, name);
if(_values.count(key)) return _values.at(key);
else
{
printf("[ERROR] Does not find the section %s with name %s. \n", section.c_str(), name.c_str());
exit(-1);
}
}
inline long INIReader::GetInteger(std::string section, std::string name, long default_value) const
{
std::string valstr = Get(section, name, "");
const char* value = valstr.c_str();
char* end;
// This parses "1234" (decimal) and also "0x4D2" (hex)
long n = strtol(value, &end, 0);
return end > value ? n : default_value;
}
inline long INIReader::GetInteger(std::string section, std::string name) const
{
std::string valstr = Get(section, name, "");
const char* value = valstr.c_str();
char* end;
// This parses "1234" (decimal) and also "0x4D2" (hex)
long n = strtol(value, &end, 0);
if(end <= value)
{
printf("[ERROR] Does not find the section %s with name %s. \n", section.c_str(), name.c_str());
exit(-1);
}
return n;
}
inline double INIReader::GetReal(std::string section, std::string name, double default_value) const
{
std::string valstr = Get(section, name, "");
const char* value = valstr.c_str();
char* end;
double n = strtod(value, &end);
return end > value ? n : default_value;
}
inline float INIReader::GetFloat(std::string section, std::string name, float default_value) const
{
std::string valstr = Get(section, name, "");
const char* value = valstr.c_str();
char* end;
float n = strtof(value, &end);
return end > value ? n : default_value;
}
inline float INIReader::GetFloat(std::string section, std::string name) const
{
std::string valstr = Get(section, name, "");
const char* value = valstr.c_str();
char* end;
float n = strtof(value, &end);
if(end <= value)
{
printf("[ERROR] Does not find the section %s with name %s. \n", section.c_str(), name.c_str());
exit(-1);
}
return n;
}
inline bool INIReader::GetBoolean(std::string section, std::string name, bool default_value) const
{
std::string valstr = Get(section, name, "");
// Convert to lower case to make string comparisons case-insensitive
std::transform(valstr.begin(), valstr.end(), valstr.begin(), ::tolower);
if (valstr == "true" || valstr == "yes" || valstr == "on" || valstr == "1")
return true;
else if (valstr == "false" || valstr == "no" || valstr == "off" || valstr == "0")
return false;
else
return default_value;
}
inline std::string INIReader::MakeKey(std::string section, std::string name)
{
std::string key = section + "=" + name;
// Convert to lower case to make section/name lookups case-insensitive
std::transform(key.begin(), key.end(), key.begin(), ::tolower);
return key;
}
inline int INIReader::ValueHandler(void* user, const char* section, const char* name,
const char* value)
{
INIReader* reader = (INIReader*)user;
std::string key = MakeKey(section, name);
if (reader->_values[key].size() > 0)
reader->_values[key] += "\n";
reader->_values[key] += value;
reader->_sections.insert(section);
return 1;
}
#endif // __INIREADER__
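// Usage sketch (illustrative): reading the [request] section of the llama INI
// config shipped in this commit; "llama_config.ini" is a placeholder path.
// Guarded by a macro so the header stays header-only.
#ifdef INIREADER_USAGE_EXAMPLE
#include <cstdio>
int main()
{
    INIReader reader("llama_config.ini");
    if (reader.ParseError() != 0) {
        std::printf("parse failed (code %d)\n", reader.ParseError());
        return 1;
    }
    long batch = reader.GetInteger("request", "request_batch_size", 1);
    std::string dtype = reader.Get("ft_instance_hyperparameter", "data_type", "fp16");
    std::printf("batch=%ld data_type=%s\n", batch, dtype.c_str());
    return 0;
}
#endif // INIREADER_USAGE_EXAMPLE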
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.11 FATAL_ERROR) # note: PyTorch extensions require a CMake version greater than 3.13
project(FasterTransformer LANGUAGES CXX CUDA)
find_package(CUDA 10.2 REQUIRED)
if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11")
    add_definitions("-DENABLE_BF16")
    message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater than or equal to 11.0, enabling -DENABLE_BF16 flag")
endif()
if((${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11" AND ${CUDA_VERSION_MINOR} VERSION_GREATER_EQUAL "8") OR (${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "12"))
    option(ENABLE_FP8 "ENABLE_FP8" OFF)
    if(ENABLE_FP8)
        add_definitions("-DENABLE_FP8")
        message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} is greater than or equal to 11.8, enabling -DENABLE_FP8 flag")
    endif()
endif()
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
option(BUILD_PYT "Build in PyTorch TorchScript class mode" OFF)
if(NOT BUILD_MULTI_GPU)
option(BUILD_MULTI_GPU "Build project about multi-GPU" OFF)
endif()
if(NOT USE_TRITONSERVER_DATATYPE)
option(USE_TRITONSERVER_DATATYPE "Build triton backend for triton server" OFF)
endif()
include(FetchContent)
FetchContent_Declare(
repo-cutlass
GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
GIT_TAG cc85b64cf676c45f98a17e3a47c0aafcf817f088
)
set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
FetchContent_MakeAvailable(repo-cutlass)
set(CUTLASS_HEADER_DIR ${PROJECT_SOURCE_DIR}/3rdparty/cutlass/include)
set(CUTLASS_EXTENSIONS_DIR ${PROJECT_SOURCE_DIR}/src/fastertransformer/cutlass_extensions/include)
option(SPARSITY_SUPPORT "Build project with Ampere sparsity feature support" OFF)
option(BUILD_FAST_MATH "Build in fast math mode" ON)
if(BUILD_MULTI_GPU)
message(STATUS "Add DBUILD_MULTI_GPU, requires MPI and NCCL")
add_definitions("-DBUILD_MULTI_GPU")
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
find_package(MPI REQUIRED)
find_package(NCCL REQUIRED)
set(CMAKE_MODULE_PATH "") # prevent the bugs for pytorch building
endif()
if(BUILD_PYT)
if(DEFINED ENV{NVIDIA_PYTORCH_VERSION})
if($ENV{NVIDIA_PYTORCH_VERSION} VERSION_LESS "20.03")
message(FATAL_ERROR "NVIDIA PyTorch image is too old for TorchScript mode.")
endif()
if($ENV{NVIDIA_PYTORCH_VERSION} VERSION_EQUAL "20.03")
add_definitions(-DLEGACY_THS=1)
endif()
endif()
endif()
if(USE_TRITONSERVER_DATATYPE)
message("-- USE_TRITONSERVER_DATATYPE")
add_definitions("-DUSE_TRITONSERVER_DATATYPE")
endif()
set(CXX_STD "14" CACHE STRING "C++ standard")
set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
set(TF_PATH "" CACHE STRING "TensorFlow path")
set(CUSPARSELT_PATH "" CACHE STRING "cuSPARSELt path")
if((BUILD_TF OR BUILD_TF2) AND NOT TF_PATH)
message(FATAL_ERROR "TF_PATH must be set if BUILD_TF or BUILD_TF2 (=TensorFlow mode) is on.")
endif()
list(APPEND CMAKE_MODULE_PATH ${CUDA_PATH}/lib64)
# profiling
option(USE_NVTX "Whether or not to use nvtx" ON)
if(USE_NVTX)
message(STATUS "NVTX is enabled.")
add_definitions("-DUSE_NVTX")
endif()
# setting compiler flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall -ldl") # -Xptxas -v
set(SM_SETS 52 60 61 70 75 80 86 89 90)
set(USING_WMMA False)
set(FIND_SM False)
foreach(SM_NUM IN LISTS SM_SETS)
string(FIND "${SM}" "${SM_NUM}" SM_POS)
if(SM_POS GREATER -1)
if(FIND_SM STREQUAL False)
set(ENV{TORCH_CUDA_ARCH_LIST} "")
endif()
set(FIND_SM True)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_${SM_NUM},code=\\\"sm_${SM_NUM},compute_${SM_NUM}\\\"")
if (SM_NUM STREQUAL 70 OR SM_NUM STREQUAL 75 OR SM_NUM STREQUAL 80 OR SM_NUM STREQUAL 86 OR SM_NUM STREQUAL 89 OR SM_NUM STREQUAL 90)
set(USING_WMMA True)
endif()
if(BUILD_PYT)
string(SUBSTRING ${SM_NUM} 0 1 SM_MAJOR)
string(SUBSTRING ${SM_NUM} 1 1 SM_MINOR)
set(ENV{TORCH_CUDA_ARCH_LIST} "$ENV{TORCH_CUDA_ARCH_LIST}\;${SM_MAJOR}.${SM_MINOR}")
endif()
list(APPEND CMAKE_CUDA_ARCHITECTURES ${SM_NUM})
message("-- Assign GPU architecture (sm=${SM_NUM})")
endif()
endforeach()
if(USING_WMMA STREQUAL True)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
message("-- Use WMMA")
endif()
if(NOT (FIND_SM STREQUAL True))
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} \
-gencode=arch=compute_70,code=\\\"sm_70,compute_70\\\" \
-gencode=arch=compute_75,code=\\\"sm_75,compute_75\\\" \
-gencode=arch=compute_80,code=\\\"sm_80,compute_80\\\" \
-gencode=arch=compute_86,code=\\\"sm_86,compute_86\\\" \
")
# -rdc=true")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DWMMA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWMMA")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DWMMA")
if(BUILD_PYT)
set(ENV{TORCH_CUDA_ARCH_LIST} "7.0;7.5;8.0;8.6")
endif()
set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86)
message("-- Assign GPU architecture (sm=70,75,80,86)")
endif()
if(BUILD_PYT)
set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST})
endif()
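# Example (illustrative): configuring with `cmake -DSM=80 ..` makes the loop
# above emit only -gencode=arch=compute_80,code="sm_80,compute_80" and append
# 80 to CMAKE_CUDA_ARCHITECTURES; leaving SM unset falls through to the default
# 70/75/80/86 set (and TORCH_CUDA_ARCH_LIST "7.0;7.5;8.0;8.6" when BUILD_PYT is on).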
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -O0")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -O0")
# set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall --ptxas-options=-v --resource-usage")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -O0 -G -Xcompiler -Wall -DCUDA_PTX_FP8_F2FP_ENABLED")
set(CMAKE_CXX_STANDARD "${CXX_STD}")
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++${CXX_STD} -DCUDA_PTX_FP8_F2FP_ENABLED")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
# set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3 --ptxas-options=--verbose")
set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xcompiler -O3 -DCUDA_PTX_FP8_F2FP_ENABLED")
if(BUILD_FAST_MATH)
set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} --use_fast_math")
message("CMAKE_CUDA_FLAGS_RELEASE: ${CMAKE_CUDA_FLAGS_RELEASE}")
endif()
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(COMMON_HEADER_DIRS
${PROJECT_SOURCE_DIR}
${CUDA_PATH}/include
${CUTLASS_HEADER_DIR}
)
message("-- COMMON_HEADER_DIRS: ${COMMON_HEADER_DIRS}")
set(COMMON_LIB_DIRS
${CUDA_PATH}/lib64
)
if (SPARSITY_SUPPORT)
list(APPEND COMMON_HEADER_DIRS ${CUSPARSELT_PATH}/include)
list(APPEND COMMON_LIB_DIRS ${CUSPARSELT_PATH}/lib64)
add_definitions(-DSPARSITY_ENABLED=1)
endif()
if(BUILD_TF)
list(APPEND COMMON_HEADER_DIRS ${TF_PATH}/include)
list(APPEND COMMON_LIB_DIRS ${TF_PATH})
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
endif()
if(BUILD_TF2)
list(APPEND COMMON_HEADER_DIRS ${TF_PATH}/include)
list(APPEND COMMON_LIB_DIRS ${TF_PATH})
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
endif()
set(PYTHON_PATH "python" CACHE STRING "Python path")
if(BUILD_PYT)
execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import torch; print(torch.__version__,end='');"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE TORCH_VERSION)
if (TORCH_VERSION VERSION_LESS "1.5.0")
message(FATAL_ERROR "PyTorch >= 1.5.0 is needed for TorchScript mode.")
endif()
execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import os; import torch;
print(os.path.dirname(torch.__file__),end='');"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE TORCH_DIR)
if (NOT _PYTHON_SUCCESS MATCHES 0)
message(FATAL_ERROR "Torch config Error.")
endif()
list(APPEND CMAKE_PREFIX_PATH ${TORCH_DIR})
find_package(Torch REQUIRED)
execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; from distutils import sysconfig;
print(sysconfig.get_python_inc());"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE PY_INCLUDE_DIR)
if (NOT _PYTHON_SUCCESS MATCHES 0)
message(FATAL_ERROR "Python config Error.")
endif()
list(APPEND COMMON_HEADER_DIRS ${PY_INCLUDE_DIR})
execute_process(COMMAND ${PYTHON_PATH} "-c" "from __future__ import print_function; import torch;
print(torch._C._GLIBCXX_USE_CXX11_ABI,end='');"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE USE_CXX11_ABI)
message("-- USE_CXX11_ABI=${USE_CXX11_ABI}")
if (USE_CXX11_ABI)
set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -D_GLIBCXX_USE_CXX11_ABI=1")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_GLIBCXX_USE_CXX11_ABI=1")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -D_GLIBCXX_USE_CXX11_ABI=1")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_GLIBCXX_USE_CXX11_ABI=1")
else()
set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -D_GLIBCXX_USE_CXX11_ABI=0")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_GLIBCXX_USE_CXX11_ABI=0")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -D_GLIBCXX_USE_CXX11_ABI=0")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
endif()
if (BUILD_MULTI_GPU)
list(APPEND COMMON_HEADER_DIRS ${MPI_INCLUDE_PATH})
list(APPEND COMMON_LIB_DIRS /usr/local/mpi/lib)
endif()
if(USE_TRITONSERVER_DATATYPE)
list(APPEND COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR}/../repo-core-src/include)
endif()
include_directories(
${COMMON_HEADER_DIRS}
)
link_directories(
${COMMON_LIB_DIRS}
)
# add_subdirectory(3rdparty)
add_subdirectory(src)
add_subdirectory(examples)
add_subdirectory(tests)
# Measure the compile time
option(MEASURE_BUILD_TIME "Measure the build time of each module" OFF)
if (MEASURE_BUILD_TIME)
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_COMMAND} -E time")
set_property(GLOBAL PROPERTY RULE_LAUNCH_CUSTOM "${CMAKE_COMMAND} -E time")
set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_COMMAND} -E time")
endif()
########################################
add_library(transformer-shared SHARED
$<TARGET_OBJECTS:BaseBeamSearchLayer>
$<TARGET_OBJECTS:BaseSamplingLayer>
$<TARGET_OBJECTS:BeamSearchLayer>
$<TARGET_OBJECTS:DynamicDecodeLayer>
$<TARGET_OBJECTS:llama_fmha>
$<TARGET_OBJECTS:Llama>
$<TARGET_OBJECTS:LlamaTritonBackend>
$<TARGET_OBJECTS:OnlineBeamSearchLayer>
$<TARGET_OBJECTS:TopKSamplingLayer>
$<TARGET_OBJECTS:TopPSamplingLayer>
$<TARGET_OBJECTS:TransformerTritonBackend>
$<TARGET_OBJECTS:activation_kernels>
$<TARGET_OBJECTS:ban_bad_words>
$<TARGET_OBJECTS:beam_search_penalty_kernels>
$<TARGET_OBJECTS:beam_search_topk_kernels>
$<TARGET_OBJECTS:bert_preprocess_kernels>
$<TARGET_OBJECTS:cublasAlgoMap>
$<TARGET_OBJECTS:cublasMMWrapper>
$<TARGET_OBJECTS:cuda_utils>
$<TARGET_OBJECTS:custom_ar_comm>
$<TARGET_OBJECTS:custom_ar_kernels>
$<TARGET_OBJECTS:decoder_masked_multihead_attention>
$<TARGET_OBJECTS:decoding_kernels>
$<TARGET_OBJECTS:gpt_kernels>
$<TARGET_OBJECTS:logprob_kernels>
$<TARGET_OBJECTS:logger>
$<TARGET_OBJECTS:memory_utils>
$<TARGET_OBJECTS:mpi_utils>
$<TARGET_OBJECTS:nccl_utils>
$<TARGET_OBJECTS:nvtx_utils>
$<TARGET_OBJECTS:online_softmax_beamsearch_kernels>
$<TARGET_OBJECTS:sampling_penalty_kernels>
$<TARGET_OBJECTS:sampling_topk_kernels>
$<TARGET_OBJECTS:sampling_topp_kernels>
$<TARGET_OBJECTS:stop_criteria>
$<TARGET_OBJECTS:tensor>
$<TARGET_OBJECTS:unfused_attention_kernels>
$<TARGET_OBJECTS:word_list>
)
if (BUILD_MULTI_GPU)
target_link_libraries(transformer-shared PUBLIC
-lmpi
${NCCL_LIBRARIES}
)
endif()
if(USE_NVTX)
target_link_libraries(transformer-shared PUBLIC
-lnvToolsExt
)
endif()
set_target_properties(transformer-shared PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(transformer-shared PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
set_target_properties(transformer-shared PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(transformer-shared PUBLIC -lcudart -lcublas -lcublasLt -lcurand)
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/FasterTransformer)
include(CMakePackageConfigHelpers)
configure_package_config_file(
${CMAKE_CURRENT_LIST_DIR}/cmake/FasterTransformerConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerConfig.cmake
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)
install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerConfig.cmake
DESTINATION ${INSTALL_CONFIGDIR}
)
install(
TARGETS
transformer-shared
EXPORT
transformer-shared-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer
)
install(
EXPORT
transformer-shared-targets
FILE
FasterTransformerTargets.cmake
DESTINATION
${INSTALL_CONFIGDIR}
)
export(
EXPORT
transformer-shared-targets
FILE
${CMAKE_CURRENT_BINARY_DIR}/FasterTransformerTargets.cmake
NAMESPACE
TritonCore::
)
export(PACKAGE FasterTransformer)
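# Consumption sketch (illustrative): a downstream project can locate the
# package exported above via the generated config file; the PATHS hint is a
# placeholder install prefix.
#   find_package(FasterTransformer REQUIRED CONFIG
#                PATHS /opt/fastertransformer/lib/cmake/FasterTransformer)
#   target_link_libraries(my_app PRIVATE ${FASTERTRANSFORMER_LIBRARIES})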
# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(CMakeFindDependencyMacro)
get_filename_component(
FASTERTRANSFORMER_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
)
list(APPEND CMAKE_MODULE_PATH ${FASTERTRANSFORMER_CMAKE_DIR})
if(NOT TARGET transformer-shared)
include("${FASTERTRANSFORMER_CMAKE_DIR}/FasterTransformerTargets.cmake")
endif()
set(FASTERTRANSFORMER_LIBRARIES transformer-shared)
# taken from https://github.com/pytorch/pytorch/blob/master/cmake/Modules_CUDA_fix/FindCUDNN.cmake
# Find the CUDNN libraries
#
# The following variables are optionally searched for defaults
# CUDNN_ROOT: Base directory where CUDNN is found
# CUDNN_INCLUDE_DIR: Directory where CUDNN header is searched for
# CUDNN_LIBRARY: Directory where CUDNN library is searched for
# CUDNN_STATIC: Are we looking for a static library? (default: no)
#
# The following are set after configuration is done:
# CUDNN_FOUND
# CUDNN_INCLUDE_PATH
# CUDNN_LIBRARY_PATH
#
include(FindPackageHandleStandardArgs)
set(CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} CACHE PATH "Folder containing NVIDIA cuDNN")
if (DEFINED ENV{CUDNN_ROOT_DIR})
    message(WARNING "CUDNN_ROOT_DIR is deprecated. Please set CUDNN_ROOT instead.")
endif()
list(APPEND CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
# Compatible layer for CMake <3.12. CUDNN_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${CUDNN_ROOT})
set(CUDNN_INCLUDE_DIR $ENV{CUDNN_INCLUDE_DIR} CACHE PATH "Folder containing NVIDIA cuDNN header files")
find_path(CUDNN_INCLUDE_PATH cudnn.h
HINTS ${CUDNN_INCLUDE_DIR}
PATH_SUFFIXES cuda/include cuda include)
option(CUDNN_STATIC "Look for static CUDNN" OFF)
if (CUDNN_STATIC)
set(CUDNN_LIBNAME "libcudnn_static.a")
else()
set(CUDNN_LIBNAME "cudnn")
endif()
set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY} CACHE PATH "Path to the cudnn library file (e.g., libcudnn.so)")
if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a" AND NOT CUDNN_STATIC)
message(WARNING "CUDNN_LIBRARY points to a static library (${CUDNN_LIBRARY}) but CUDNN_STATIC is OFF.")
endif()
find_library(CUDNN_LIBRARY_PATH ${CUDNN_LIBNAME}
PATHS ${CUDNN_LIBRARY}
PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
find_package_handle_standard_args(CUDNN DEFAULT_MSG CUDNN_LIBRARY_PATH CUDNN_INCLUDE_PATH)
mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY)
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# From PyTorch:
#
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
# Copyright (c) 2011-2013 NYU (Clement Farabet)
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
#
# From Caffe2:
#
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
#
# All contributions by Facebook:
# Copyright (c) 2016 Facebook Inc.
#
# All contributions by Google:
# Copyright (c) 2015 Google Inc.
# All rights reserved.
#
# All contributions by Yangqing Jia:
# Copyright (c) 2015 Yangqing Jia
# All rights reserved.
#
# All contributions by Kakao Brain:
# Copyright 2019-2020 Kakao Brain
#
# All contributions from Caffe:
# Copyright(c) 2013, 2014, 2015, the respective contributors
# All rights reserved.
#
# All other contributions:
# Copyright(c) 2015, 2016 the respective contributors
# All rights reserved.
#
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
# copyright over their contributions to Caffe2. The project versioning records
# all such contribution and copyright details. If a contributor wants to further
# mark their specific copyright on a particular contribution, they should
# indicate their copyright solely in the commit message of the change when it is
# committed.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
# and IDIAP Research Institute nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Find the nccl libraries
#
# The following variables are optionally searched for defaults
# NCCL_ROOT: Base directory where all NCCL components are found
# NCCL_INCLUDE_DIR: Directory where NCCL header is found
# NCCL_LIB_DIR: Directory where NCCL library is found
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIRS
# NCCL_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601
set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder containing NVIDIA NCCL headers")
set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder containing NVIDIA NCCL libraries")
set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")
if (DEFINED ENV{NCCL_ROOT_DIR})
    message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
endif()
list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})
find_path(NCCL_INCLUDE_DIRS
NAMES nccl.h
HINTS ${NCCL_INCLUDE_DIR})
if (USE_STATIC_NCCL)
MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
SET(NCCL_LIBNAME "nccl_static")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
else()
SET(NCCL_LIBNAME "nccl")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
endif()
find_library(NCCL_LIBRARIES
NAMES ${NCCL_LIBNAME}
HINTS ${NCCL_LIB_DIR})
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
if(NCCL_FOUND) # obtaining NCCL version and some sanity checks
set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)
if (NCCL_VERSION_DEFINED)
set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
file(WRITE ${file} "
#include <iostream>
#include <nccl.h>
int main()
{
std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
int x;
ncclGetVersion(&x);
return x == NCCL_VERSION_CODE;
}
")
try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
LINK_LIBRARIES ${NCCL_LIBRARIES})
if (NOT NCCL_VERSION_MATCHED)
message(FATAL_ERROR "Found NCCL header version and library version do not match! \
(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
endif()
message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
else()
# message(STATUS "NCCL version < 2.3.5-5")
endif ()
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})
message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
endif()
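# Usage note (illustrative): the top-level CMakeLists.txt reaches this module
# through find_package(NCCL REQUIRED) once cmake/Modules is on CMAKE_MODULE_PATH;
# exporting NCCL_INCLUDE_DIR/NCCL_LIB_DIR in the environment (paths below are
# hypothetical) overrides the search:
#   NCCL_INCLUDE_DIR=/usr/include NCCL_LIB_DIR=/usr/lib/x86_64-linux-gnu cmake ..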
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
include(CMakeFindDependencyMacro)
get_filename_component(
TRITONPYTORCHBACKEND_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH
)
list(APPEND CMAKE_MODULE_PATH ${TRITONPYTORCHBACKEND_CMAKE_DIR})
if(NOT TARGET TritonPyTorchBackend::triton-pytorch-backend)
include("${TRITONPYTORCHBACKEND_CMAKE_DIR}/TritonPyTorchBackendTargets.cmake")
endif()
set(TRITONPYTORCHBACKEND_LIBRARIES TritonPyTorchBackend::triton-pytorch-backend)
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
add_subdirectory(cpp)
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
add_subdirectory(llama)
# Copyright (c) OpenMMLab. All rights reserved.
add_executable(llama_triton_example llama_triton_example.cc)
target_link_libraries(llama_triton_example PUBLIC -lcublas -lcublasLt -lcudart
LlamaTritonBackend TransformerTritonBackend mpi_utils nccl_utils
nvtx_utils word_list glog)
# Copyright (c) OpenMMLab. All rights reserved.
import subprocess
import fire
def main(head_num: int = 80,
size_per_head: int = 128,
vocab_size: int = 65632,
inter_size: int = 27392,
tensor_para_size: int = 8,
max_batch_size: int = 64):
for bsz in range(1, max_batch_size + 1):
subprocess.call(
f'bin/gpt_gemm {bsz} 1 1 {head_num} {size_per_head} {inter_size} {vocab_size} 1 {tensor_para_size} {0 if bsz == 1 else 1}',
shell=True)
if __name__ == '__main__':
fire.Fire(main)
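# Usage sketch (the script name is a placeholder; assumes the gpt_gemm binary
# built by this repo sits under ./bin, i.e. the script is run from the build
# directory). For example, LLaMA-7B without tensor parallelism:
#   python generate_gemm_config.py --head_num 32 --size_per_head 128 \
#       --vocab_size 32000 --inter_size 11008 --tensor_para_size 1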
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import fire
import os.path as osp
from os import makedirs
from pathlib import Path
import safetensors
from tqdm import tqdm
def import_fb(ckpt_dir: str):
checkpoints = []
for pattern in ['*.pth', '*.pt']:
checkpoints += sorted(Path(ckpt_dir).glob(pattern))
print(checkpoints)
n_ckpt = len(checkpoints)
model_params = {}
def get_param(name, size):
print(name, size)
if name not in model_params:
model_params[name] = torch.zeros(
size, dtype=torch.float16, device='cpu')
return model_params[name]
for i, ckpt_path in enumerate(checkpoints):
ckpt = torch.load(ckpt_path, map_location='cpu')
for param_name, param_data in ckpt.items():
key = param_name.split('.')[-2]
if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'output']: # column-parallel
size = param_data.size(0)
param = get_param(
param_name, [size * n_ckpt, param_data.size(1)])
param.data[size * i: size * (i + 1), :] = param_data
elif key in ['w2', 'wo', 'tok_embeddings']: # row-parallel
size = param_data.size(-1)
param = get_param(
param_name, [param_data.size(0), size * n_ckpt])
param.data[:, size * i: size * (i + 1)] = param_data
elif i == 0:
param = get_param(param_name, param_data.size())
param.data = param_data
del ckpt
for name, param in model_params.items():
# transpose all weights as FasterTransformer is expecting column-major weights
# (output_dims, input_dims) -> (input_dims, output_dims)
key = name.split('.')[-2]
if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']:
param.data = param.data.t()
# concat qkv projection
    for i in range(1000):  # iterate over layers until the qkv weights run out
_qkv = [f'layers.{i}.attention.{k}.weight' for k in ['wq', 'wk', 'wv']]
try:
qkv = tuple(map(model_params.pop, _qkv))
except KeyError:
break
qkv = torch.stack(qkv, dim=1)
model_params[f'layers.{i}.attention.w_qkv.weight'] = qkv
print(qkv.shape, qkv.dtype)
return model_params
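# Illustrative shard shapes (hypothetical 2-way FB checkpoint of the 7B config,
# hidden dim 4096): a column-parallel 'wq' shard of [2048, 4096] is stitched
# along dim 0 into [4096, 4096], while a row-parallel 'wo' shard of [4096, 2048]
# is stitched along dim 1, matching the two branches above.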
def permute(x: torch.Tensor):
SIZE_PER_HEAD = 128
if x.shape[-1] > 1: # qweights
dim = x.shape[-1]
n_heads = dim // SIZE_PER_HEAD
return x.view(-1, n_heads, 2, dim // n_heads // 2).transpose(2, 3).reshape(-1, dim)
else: # scales, zeros
dim = x.shape[0]
n_heads = dim // SIZE_PER_HEAD
return x.view(n_heads, 2, dim // n_heads // 2, 1).transpose(1, 2).reshape(dim, 1)
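# Layout note (illustrative): permute() converts the HF rotary layout, where
# each 128-wide head keeps its two rotary halves contiguous, into FB's
# interleaved layout (see the "different layout for fb & hf" comment in
# import_gptq below); shapes are preserved, only the per-head elements are
# regrouped.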
def check_zero(x: torch.Tensor):
    total = x.flatten().sum().item()
    assert total == 0, str(total)
def import_gptq(path: str):
model_params = {}
_qweight = 'weight'
_suffixes = [_qweight]
n_split = 3
    if True:  # load sharded torch .bin files (a safetensors alternative is sketched below)
_params = {}
for i in tqdm(range(0, n_split)):
filename = "pytorch_model-{:05d}-of-{:05d}.bin".format(i + 1, n_split)
_tmp = torch.load(osp.join(path, filename), map_location='cpu')
_params.update(_tmp)
# print('\n'.join(_params.keys()))
def get_tensor(name):
return _params[name]
def get_tensor_transposed(name):
return _params[name].t()
# _qweight = 'qweight'
# _suffixes = [_qweight, 'bias', 'scales', 'zeros']
# with safetensors.safe_open(path, framework='pt') as f:
# get_tensor = f.get_tensor
# # quantized weights are already in column major, no need to transpose
# get_tensor_transposed = get_tensor
for i in range(1000):
try:
# attention weights
_qkvo = [f'model.layers.{i}.self_attn.{t}_proj' for t in 'qkvo']
for suffix in _suffixes:
q, k, v, o = map(get_tensor_transposed, map(('{}.' + suffix).format, _qkvo))
if suffix == 'bias':
check_zero(q), check_zero(k), check_zero(v), check_zero(o)
else:
# q, k has different layout for fb & hf, convert to fb's layout
q = permute(q)
k = permute(k)
if suffix == _qweight: # weight, qweight
# insert a dimension for splitting heads later
# qkv = torch.cat([q[:, None, :], k[:, None, :], v[:, None, :]], dim=1)
qkv = torch.stack((q, k, v), dim=1)
else: # scales, zeros
# qkv = torch.cat([q[None, :], k[None, :], v[None, :]], dim=0).squeeze(dim=-1)
qkv = torch.stack((q, k, v), dim=0).squeeze(dim=-1)
                for name, tensor in [('w_qkv', qkv), ('wo', o)]:
                    model_params[f'layers.{i}.attention.{name}.{suffix}'] = tensor
# ffn weights
_w123 = [f'model.layers.{i}.mlp.{t}_proj' for t in ['gate', 'down', 'up']]
for suffix in _suffixes:
w1, w2, w3 = map(get_tensor_transposed, map(('{}.' + suffix).format, _w123))
if suffix == 'bias':
check_zero(w1), check_zero(w2), check_zero(w3)
else:
if suffix in ['scales', 'zeros']:
w1, w2, w3 = map(lambda x: x.squeeze(dim=-1), [w1, w2, w3])
                for name, tensor in [('w1', w1), ('w2', w2), ('w3', w3)]:
                    model_params[f'layers.{i}.feed_forward.{name}.{suffix}'] = tensor
other = [('attention_norm.weight', 'input_layernorm.weight'),
('ffn_norm.weight', 'post_attention_layernorm.weight')]
for ours, theirs in other:
model_params[f'layers.{i}.' + ours] = get_tensor(f'model.layers.{i}.' + theirs)
except safetensors.SafetensorError:
break
except KeyError:
break
print(i)
other = [('tok_embeddings.weight', 'model.embed_tokens.weight'),
('norm.weight', 'model.norm.weight'),
('output.weight', 'lm_head.weight')]
for ours, theirs in other:
model_params[ours] = get_tensor(theirs)
return model_params
def export(model_params: dict, out_dir: str, n_inference: int):
makedirs(out_dir, exist_ok=True)
def save_bin(param: torch.Tensor, name):
print(name, param.shape)
if param.dtype in [torch.float, torch.bfloat16]:
param = param.half()
param.contiguous().numpy().tofile(osp.join(out_dir, name))
    # reverse the splitting axes since the weights were transposed above
for param_name, param_data in model_params.items():
split_dim = None
key, ext = param_name.split('.')[-2:]
copy = False
if key in ['w1', 'w3', 'w_qkv']:
split_dim = -1
elif key in ['w2', 'wo']:
if ext in ['scales', 'zeros']:
copy = True
else:
split_dim = 0
if split_dim is not None:
            print(f'*** splitting {param_name}, shape={param_data.shape}, split_dim={split_dim}')
assert param_data.shape[split_dim] % n_inference == 0
split_size = param_data.shape[split_dim] // n_inference
splits = torch.split(param_data, split_size, dim=split_dim)
for i, split in enumerate(splits):
prefix, ext = osp.splitext(param_name)
save_bin(split, f'{prefix}.{i}{ext}')
elif copy:
print(f'### copying {param_name}, shape={param_data.shape}')
copies = [param_data] * n_inference
for i, copy in enumerate(copies):
prefix, ext = osp.splitext(param_name)
save_bin(copy, f'{prefix}.{i}{ext}')
else:
save_bin(param_data, param_name)
def main(kind: str, input_path: str, out_dir: str, n_inference: int = 1):
if kind == 'fb':
model_params = import_fb(input_path)
elif kind == 'gptq':
model_params = import_gptq(input_path)
else:
raise RuntimeError(f'Unsupported kind: {kind}')
export(model_params, out_dir, n_inference)
if __name__ == '__main__':
fire.Fire(main)
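# Usage sketch (the script name and paths are placeholders):
#   # original FB checkpoint -> FT weights for tensor_para_size=1
#   python llama_convert.py fb /data/llama/7B /data/llama/7B-ft --n_inference 1
#   # GPTQ/HF-style .bin shards, split for 2 inference ranks
#   python llama_convert.py gptq /data/llama-gptq /data/llama-ft2 --n_inference 2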
[ft_instance_hyperparameter]
data_type=fp16
enable_custom_all_reduce=0
pipeline_para_size=1
tensor_para_size=8
model_dir=/shared_data/chatpjlm-0/v0.2.3/fastertransformer/weights/
[request]
request_batch_size=8
request_output_len=2048
beam_width=1 ; beam width for beam search
top_k=1 ; k value for top k sampling
top_p=0.0 ; p value for top p sampling
temperature=1.0 ; Use for sampling
repetition_penalty=1.00 ; Use for sampling
presence_penalty=0.0 ; Only one of repetition_penalty and presence_penalty is allowed.
len_penalty=0.0
beam_search_diversity_rate=0.0
; PJLM start/end ids
start_id=0
end_id=1
; --------------------- legacy params -------------------------
; LLaMA start/end ids
; start_id=1
; end_id=2
[4999_llama]
head_num=80
size_per_head=128
vocab_size=65632
num_layer=82
rotary_embedding=128
norm_eps=1e-5
start_id=0
end_id=1
inter_size=27392
[llama_7B]
head_num=32
size_per_head=128
vocab_size=32000
num_layer=32
rotary_embedding=128
start_id=1
end_id=2
inter_size=11008
[llama_13B]
head_num=40
size_per_head=128
vocab_size=32000
num_layer=40
rotary_embedding=128
start_id=1
end_id=2
inter_size=13824
[llama_30B]
head_num=52
size_per_head=128
vocab_size=32000
num_layer=60
rotary_embedding=128
start_id=1
end_id=2
inter_size=17920
[llama_65B]
head_num=64
size_per_head=128
vocab_size=32000
num_layer=80
rotary_embedding=128
start_id=1
end_id=2
inter_size=22016
0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,44883,2282,32901,4220,46323,13,44975,45004,11130,32843,45004,35597
0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,46088,46064,625,19880,46323,13,44975,45004,11130,32843,45004,35597
0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,47335,56437,60468,46323,13,44975,45004,11130,32843,45004,35597
0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,44883,2282,6828,3467,46323,13,44975,45004,11130,32843,45004,35597
0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,36589,3467,7849,299,7032,46323,13,44975,45004,11130,32843,45004,35597
0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,44976,39798,6828,3467,46323,13,44975,45004,11130,32843,45004,35597
0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,2795,977,9193,299,405,537,46323,13,44975,45004,11130,32843,45004,35597
0,18396,22305,13,4662,561,399,326,44875,29913,6938,1198,345,3134,39407,320,47997,45778,45121,61969,47371,492,13,44872,65616,47997,45778,45121,61969,47371,345,263,13820,1558,5515,2404,409,345,12643,521,41109,34993,326,44875,24488,10677,320,45691,45926,45513,46641,47641,46285,6456,492,824,345,12314,307,377,11951,44863,23391,44863,329,5420,935,421,44858,13,44872,65616,47997,45778,45121,61969,47371,541,2914,329,34352,30302,3530,299,278,5515,14966,521,278,1711,1591,425,5716,329,65616,45452,45545,44858,13,570,996,372,13,44975,45004,44950,11111,45004,35597,45691,45926,45513,46641,47641,46285,6456,46323,13,44975,45004,11130,32843,45004,35597
from sentencepiece import SentencePieceProcessor
from typing import List
import fire
class Tokenizer:
def __init__(self, model_file: str):
self.model = SentencePieceProcessor(model_file=model_file)
self.vocab_size = self.model.vocab_size()
self.start_id = self.model.bos_id()
self.end_id = self.model.eos_id()
self.pad_id = self.model.pad_id()
print(f'vocab_size = {self.vocab_size}')
print(f'start_id = {self.start_id}')
print(f'end_id = {self.end_id}')
print(f'pad_id = {self.pad_id}')
def encode(self, s: str):
return self.model.Encode(s, add_bos=True)
def decode(self, t: List[int]):
return self.model.Decode(t)
def main(model_file: str = '/data/llama/model/tokenizer.model',
encode_file: str = None, decode_file: str = None):
tokenizer = Tokenizer(model_file)
if encode_file:
with open(encode_file, 'r') as f:
xs = tokenizer.encode(f.read())
print(','.join(map(str, xs)))
elif decode_file:
with open(decode_file, 'r') as f:
            # decode_file is expected to hold comma-separated token ids (the encode output format)
            ys = tokenizer.decode([int(t) for t in f.read().strip().split(',')])
print(ys)
else:
first = True
while True:
try:
s = input()
except EOFError:
break
if not first:
print('---------------------------------------------')
first = False
try:
xs = map(int, s.strip().split(' '))
s = tokenizer.decode(list(xs))
print(s)
except ValueError:
xs = tokenizer.encode(s)
print(' '.join(map(str, xs)))
if __name__ == '__main__':
fire.Fire(main)
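# Usage sketch (script name is a placeholder; model_file defaults to the path
# in main()). With no --encode_file/--decode_file it runs an interactive
# round-trip: typed text is encoded to ids, and a line of space-separated ids
# is decoded back to text.
#   python llama_tokenizer.py --model_file /data/llama/model/tokenizer.model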
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
add_subdirectory(fastertransformer)
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
add_subdirectory(utils)
add_subdirectory(kernels)
add_subdirectory(layers)
add_subdirectory(models)
if(BUILD_PYT)
add_subdirectory(th_op)
endif()
add_subdirectory(triton_backend)
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
add_library(ban_bad_words STATIC ban_bad_words.cu)
set_property(TARGET ban_bad_words PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET ban_bad_words PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(stop_criteria STATIC stop_criteria_kernels.cu)
set_property(TARGET stop_criteria PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET stop_criteria PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(activation_kernels STATIC activation_kernels.cu)
set_property(TARGET activation_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET activation_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(gen_relative_pos_bias STATIC gen_relative_pos_bias.cu)
set_property(TARGET gen_relative_pos_bias PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gen_relative_pos_bias PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(gen_relative_pos_bias PUBLIC activation_kernels)
add_library(logprob_kernels STATIC logprob_kernels.cu)
set_property(TARGET logprob_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET logprob_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(unfused_attention_kernels STATIC unfused_attention_kernels.cu)
set_property(TARGET unfused_attention_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET unfused_attention_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(bert_preprocess_kernels STATIC bert_preprocess_kernels.cu)
set_property(TARGET bert_preprocess_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET bert_preprocess_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
set(decoder_masked_multihead_attention_files
decoder_masked_multihead_attention.cu
)
file(GLOB decoder_masked_multihead_attention_files ${decoder_masked_multihead_attention_files} ./decoder_masked_multihead_attention/*.cu)
add_library(decoder_masked_multihead_attention STATIC ${decoder_masked_multihead_attention_files})
set_property(TARGET decoder_masked_multihead_attention PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoder_masked_multihead_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(online_softmax_beamsearch_kernels STATIC online_softmax_beamsearch_kernels.cu)
set_property(TARGET online_softmax_beamsearch_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET online_softmax_beamsearch_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(decoding_kernels STATIC decoding_kernels.cu)
set_property(TARGET decoding_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoding_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(gpt_kernels STATIC gpt_kernels.cu)
set_property(TARGET gpt_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gpt_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(beam_search_penalty_kernels STATIC beam_search_penalty_kernels.cu)
set_property(TARGET beam_search_penalty_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET beam_search_penalty_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(beam_search_penalty_kernels PRIVATE cuda_utils)
add_library(beam_search_topk_kernels STATIC beam_search_topk_kernels.cu)
set_property(TARGET beam_search_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET beam_search_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(sampling_topk_kernels STATIC sampling_topk_kernels.cu)
set_property(TARGET sampling_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET sampling_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(sampling_topp_kernels STATIC sampling_topp_kernels.cu)
set_property(TARGET sampling_topp_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET sampling_topp_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(sampling_penalty_kernels STATIC sampling_penalty_kernels.cu)
set_property(TARGET sampling_penalty_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET sampling_penalty_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(custom_ar_kernels STATIC custom_ar_kernels.cu)
set_property(TARGET custom_ar_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET custom_ar_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
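# Sketch (illustrative, not used above): the repeated add_library/set_property
# trio could be folded into a helper so new kernels stay consistent, e.g.
#   function(ft_add_kernel_lib name)
#     add_library(${name} STATIC ${ARGN})
#     set_property(TARGET ${name} PROPERTY POSITION_INDEPENDENT_CODE ON)
#     set_property(TARGET ${name} PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#   endfunction()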