Commit e90433a0 authored by Casper

Initial commit

parent 5440c0aa
.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
......
# AutoAWQ_kernels
\ No newline at end of file
# AutoAWQ Kernels
AutoAWQ Kernels is a new package that has been split out from the [main repository](https://github.com/casper-hansen/AutoAWQ) in order to reduce compilation times.
## Requirements
- Windows: Must use WSL2.
- GPU: Must be compute capability 7.5 or higher.
- CUDA Toolkit: Must be 11.8 or higher.
\ No newline at end of file
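A quick way to check the GPU and CUDA toolkit requirements from Python (a minimal sketch; assumes PyTorch with CUDA support is already installed):

```python
import torch

# Compute capability must be 7.5 or higher (Turing or newer).
major, minor = torch.cuda.get_device_capability(0)
assert (major, minor) >= (7, 5), f"compute capability {major}.{minor} is below 7.5"

# The CUDA toolkit PyTorch was built against must be 11.8 or higher.
cuda_major, cuda_minor = (int(v) for v in torch.version.cuda.split("."))
assert (cuda_major, cuda_minor) >= (11, 8), f"CUDA {torch.version.cuda} is below 11.8"
```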
// Downloaded from FasterTransformer v5.2.1
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/utils/cuda_bf16_fallbacks.cuh
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "cuda_bf16_wrapper.h"
#include <cuda_fp16.h>
namespace fastertransformer {
#ifdef ENABLE_BF16
inline __device__ float2 bf1622float2(const __nv_bfloat162 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float2 f_val;
f_val.x = __low2float(val);
f_val.y = __high2float(val);
return f_val;
#else
return __bfloat1622float2(val);
#endif
}
inline __device__ int16_t bf1622int16(__nv_bfloat162 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float2 f_val;
f_val.x = max(min(__low2float(val), 127.f), -128.f);
f_val.y = max(min(__high2float(val), 127.f), -128.f);
union { int8_t int8[2]; int16_t int16; };
int8[0] = static_cast<int8_t>(static_cast<short>(f_val.x));
int8[1] = static_cast<int8_t>(static_cast<short>(f_val.y));
return int16;
#else
val = __hmin2(val, make_bfloat162(127., 127.));
val = __hmax2(val, make_bfloat162(-128., -128.));
union { int8_t int8[2]; int16_t int16; };
int8[0] = static_cast<int8_t>(static_cast<short>(val.x));
int8[1] = static_cast<int8_t>(static_cast<short>(val.y));
return int16;
#endif
}
inline __device__ __nv_bfloat162 float22bf162(const float2 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __floats2bfloat162_rn(val.x, val.y);
#else
return __float22bfloat162_rn(val);
#endif
}
inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
__nv_bfloat162 val2;
val2.x = val;
val2.y = val;
return val2;
#else
return __bfloat162bfloat162(val);
#endif
}
inline __device__ __nv_bfloat162 bf16hadd2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh, fyl, fyh;
fxl = __low2float(x);
fxh = __high2float(x);
fyl = __low2float(y);
fyh = __high2float(y);
return __floats2bfloat162_rn(fxl + fyl, fxh + fyh);
#else
return __hadd2(x, y);
#endif
}
inline __device__ __nv_bfloat16 bf16hadd(const __nv_bfloat16 x, const __nv_bfloat16 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16( __bfloat162float(x) + __bfloat162float(y) );
#else
return __hadd(x, y);
#endif
}
inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh, fyl, fyh;
fxl = __low2float(x);
fxh = __high2float(x);
fyl = __low2float(y);
fyh = __high2float(y);
return __floats2bfloat162_rn(fxl - fyl, fxh - fyh);
#else
return __hsub2(x, y);
#endif
}
inline __device__ __nv_bfloat16 bf16hsub(const __nv_bfloat16 x, const __nv_bfloat16 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16( __bfloat162float(x) - __bfloat162float(y) );
#else
return __hsub(x, y);
#endif
}
inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh, fyl, fyh;
fxl = __low2float(x);
fxh = __high2float(x);
fyl = __low2float(y);
fyh = __high2float(y);
return __floats2bfloat162_rn(fxl * fyl, fxh * fyh);
#else
return __hmul2(x, y);
#endif
}
inline __device__ __nv_bfloat16 bf16hmul(const __nv_bfloat16 x, const __nv_bfloat16 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) );
#else
return __hmul(x, y);
#endif
}
inline __device__ __nv_bfloat162 bf16hfma2(const __nv_bfloat162 x, const __nv_bfloat162 y, const __nv_bfloat162 z) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh, fyl, fyh, fzl, fzh;
fxl = __low2float(x);
fxh = __high2float(x);
fyl = __low2float(y);
fyh = __high2float(y);
fzl = __low2float(z);
fzh = __high2float(z);
return __floats2bfloat162_rn(fxl * fyl + fzl, fxh * fyh + fzh);
#else
return __hfma2(x, y, z);
#endif
}
inline __device__ __nv_bfloat16 bf16hfma(const __nv_bfloat16 x, const __nv_bfloat16 y, const __nv_bfloat16 z) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) + __bfloat162float(z));
#else
return __hfma(x, y, z);
#endif
}
inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh;
fxl = __low2float(x);
fxh = __high2float(x);
return __floats2bfloat162_rn(expf(fxl), expf(fxh));
#else
return h2exp(x);
#endif
}
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
inline __device__ __nv_bfloat162 operator*(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hmul2(x, y); };
inline __device__ __nv_bfloat162 operator+(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hadd2(x, y); };
inline __device__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y)
{
__nv_bfloat162 t; t.x = x; t.y = y; return t;
}
#endif
inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c));
#else
return a + b + c;
#endif
}
inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c) + __bfloat162float(d));
#else
return (__nv_bfloat16)((float)a + (float)b + (float)c + (float)d);
#endif
}
inline __device__ __nv_bfloat162 bf16hadd2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fal, fah, fbl, fbh, fcl, fch;
fal = __low2float(a);
fah = __high2float(a);
fbl = __low2float(b);
fbh = __high2float(b);
fcl = __low2float(c);
fch = __high2float(c);
return __floats2bfloat162_rn(fal + fbl + fcl, fah + fbh + fch);
#else
return a + b + c;
#endif
}
inline __device__ __nv_bfloat16 bf16hmul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b) * __bfloat162float(c));
#else
return a * b * c;
#endif
}
inline __device__ __nv_bfloat162 bf16hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fal, fah, fbl, fbh, fcl, fch;
fal = __low2float(a);
fah = __high2float(a);
fbl = __low2float(b);
fbh = __high2float(b);
fcl = __low2float(c);
fch = __high2float(c);
return __floats2bfloat162_rn(fal * fbl * fcl, fah * fbh * fch);
#else
return a * b * c;
#endif
}
inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fal, fah, fbl, fbh, fcl, fch, fdl, fdh;
fal = __low2float(a);
fah = __high2float(a);
fbl = __low2float(b);
fbh = __high2float(b);
fcl = __low2float(c);
fch = __high2float(c);
fdl = __low2float(d);
fdh = __high2float(d);
return __floats2bfloat162_rn(fal * fbl * fcl + fdl, fah * fbh * fch + fdh);
#else
return a * b * c + d;
#endif
}
#endif // ENABLE_BF16
} // namespace fastertransformer
// Downloaded from FasterTransformer v5.2.1
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/utils/cuda_bf16_wrapper.h
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef ENABLE_BF16
#include <cuda_bf16.h>
#endif
// Adapted from FasterTransformer v5.2.1
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_128.cu
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "decoder_masked_multihead_attention.h"
#include "decoder_masked_multihead_attention_utils.h"
#include "cuda_bf16_wrapper.h"
#include <assert.h>
#include <float.h>
#include <type_traits>
#include "decoder_masked_multihead_attention_template.hpp"
////////////////////////////////////////////////////////////////////////////////////////////////////
#define MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, DO_CROSS_ATTENTION, stream) \
size_t smem_sz = mmha::smem_size_in_bytes<T, DO_CROSS_ATTENTION>(params, THDS_PER_VALUE, THDS_PER_BLOCK); \
auto kernel = mmha::masked_multihead_attention_kernel<T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, \
THDS_PER_BLOCK, DO_CROSS_ATTENTION>; \
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_sz); \
dim3 grid(params.num_heads, params.batch_size); \
kernel<<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(params)
////////////////////////////////////////////////////////////////////////////////////////////////////
// !!! Specialize the launcher for Cross attention
template<typename T, int Dh, int Dh_MAX, typename KERNEL_PARAMS_TYPE>
void mmha_launch_kernel(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream)
{
constexpr int THREADS_PER_VALUE = Dh_MAX * sizeof(T) / 16;
constexpr bool DO_CROSS_ATTENTION = std::is_same<KERNEL_PARAMS_TYPE, Cross_multihead_attention_params<T>>::value;
int tlength = (DO_CROSS_ATTENTION) ? params.memory_max_len : params.timestep;
// printf("tlength, CROSS_ATTENTION = %d, %d\n", tlength, DO_CROSS_ATTENTION);
if (tlength < 32) {
MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 64, DO_CROSS_ATTENTION, stream);
}
else if (tlength < 2048) {
MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 2, THREADS_PER_VALUE, 128, DO_CROSS_ATTENTION, stream);
}
else {
MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 1, THREADS_PER_VALUE, 256, DO_CROSS_ATTENTION, stream);
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
#undef MMHA_LAUNCH_KERNEL
template<typename T, typename KERNEL_PARAMS_TYPE>
void multihead_attention_(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream)
{
switch (params.hidden_size_per_head) {
case 32:
mmha_launch_kernel<T, 32, 32, KERNEL_PARAMS_TYPE>(params, stream);
break;
case 48:
mmha_launch_kernel<T, 48, 64, KERNEL_PARAMS_TYPE>(params, stream);
break;
case 64:
mmha_launch_kernel<T, 64, 64, KERNEL_PARAMS_TYPE>(params, stream);
break;
case 80:
mmha_launch_kernel<T, 80, 128, KERNEL_PARAMS_TYPE>(params, stream);
break;
case 96:
mmha_launch_kernel<T, 96, 128, KERNEL_PARAMS_TYPE>(params, stream);
break;
case 112:
mmha_launch_kernel<T, 112, 128, KERNEL_PARAMS_TYPE>(params, stream);
break;
case 128:
mmha_launch_kernel<T, 128, 128, KERNEL_PARAMS_TYPE>(params, stream);
break;
case 160:
mmha_launch_kernel<T, 160, 256, KERNEL_PARAMS_TYPE>(params, stream);
break;
case 192:
mmha_launch_kernel<T, 192, 256, KERNEL_PARAMS_TYPE>(params, stream);
break;
case 224:
mmha_launch_kernel<T, 224, 256, KERNEL_PARAMS_TYPE>(params, stream);
break;
case 256:
mmha_launch_kernel<T, 256, 256, KERNEL_PARAMS_TYPE>(params, stream);
break;
default:
assert(false);
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////
void masked_multihead_attention(const Masked_multihead_attention_params<float>& params, const cudaStream_t& stream)
{
multihead_attention_<float, Masked_multihead_attention_params<float>>(params, stream);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
void masked_multihead_attention(const Masked_multihead_attention_params<uint16_t>& params, const cudaStream_t& stream)
{
multihead_attention_<uint16_t, Masked_multihead_attention_params<uint16_t>>(params, stream);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_BF16
void masked_multihead_attention(const Masked_multihead_attention_params<__nv_bfloat16>& params,
const cudaStream_t& stream)
{
multihead_attention_<__nv_bfloat16, Masked_multihead_attention_params<__nv_bfloat16>>(params, stream);
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
void cross_multihead_attention(const Cross_multihead_attention_params<float>& params, const cudaStream_t& stream)
{
multihead_attention_<float, Cross_multihead_attention_params<float>>(params, stream);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
void cross_multihead_attention(const Cross_multihead_attention_params<uint16_t>& params, const cudaStream_t& stream)
{
multihead_attention_<uint16_t, Cross_multihead_attention_params<uint16_t>>(params, stream);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef ENABLE_BF16
void cross_multihead_attention(const Cross_multihead_attention_params<__nv_bfloat16>& params,
const cudaStream_t& stream)
{
multihead_attention_<__nv_bfloat16, Cross_multihead_attention_params<__nv_bfloat16>>(params, stream);
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// Downloaded from FasterTransformer v5.2.1
// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention.h
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "cuda_bf16_wrapper.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
////////////////////////////////////////////////////////////////////////////////////////////////////
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_)); \
exit(1); \
} \
} while (0)
////////////////////////////////////////////////////////////////////////////////////////////////////
// The structure of parameters for the masked multihead attention kernel.
//
// We use the following terminology to describe the different dimensions.
//
// B: Batch size (number of sequences),
// L: Sequence length,
// D: Hidden dimension,
// H: Number of heads,
// Dh: Hidden dimension per head - Dh = D / H.
template<typename T>
struct Multihead_attention_params_base {
// The output buffer. Dimensions B x D.
T* out = nullptr;
// The input Qs and the associated bias. Dimensions B x D and D, resp.
const T *q = nullptr, *q_bias = nullptr;
// The input Ks and the associated bias. Dimensions B x D and D, resp.
const T *k = nullptr, *k_bias = nullptr;
// The input Vs and the associated bias. Dimensions B x D and D, resp.
const T *v = nullptr, *v_bias = nullptr;
// The cache for the Ks. The size must be at least B x L x D.
T* k_cache = nullptr;
// The cache for the Vs. The size must be at least B x L x D.
T* v_cache = nullptr;
// The indirections to use for cache when beam sampling.
const int* cache_indir = nullptr;
// Stride to handle the case when KQV is a single buffer
int stride = 0;
// The batch size.
int batch_size = 0;
// The beam width
int beam_width = 0;
// The sequence length.
int memory_max_len = 0;
// The number of heads (H).
int num_heads = 0;
// The number of heads for KV cache.
int num_kv_heads = 0;
// The hidden dimension per head (Dh).
int hidden_size_per_head = 0;
// The per-head latent space reserved for rotary embeddings.
int rotary_embedding_dim = 0;
bool neox_rotary_style = false;
float rotary_base = 0.0f;
// The maximum length of input sentences.
int max_input_length = 0;
// The current timestep. TODO(bhsueh): check whether we only need this param in cross attention.
int timestep = 0;
// The current timestep of each sentence (supports a different timestep per sentence)
// The 1.f / sqrt(Dh). Computed on the host.
float inv_sqrt_dh = 0.0f;
// Used when we have some input context, e.g. GPT-style prompts
const int* total_padding_tokens = nullptr;
const bool* masked_tokens = nullptr;
const int* prefix_prompt_lengths = nullptr;
int max_prefix_prompt_length = 0;
const T* relative_attention_bias = nullptr;
int relative_attention_bias_stride = 0;
// The slope per head of linear position bias to attention score (H).
const float* linear_bias_slopes = nullptr;
const T* ia3_key_weights = nullptr;
const T* ia3_value_weights = nullptr;
const int* ia3_tasks = nullptr;
const float* qkv_scale_out = nullptr;
const float* attention_out_scale = nullptr;
int int8_mode = 0;
};
template<typename T, bool CROSS_ATTENTION>
struct Multihead_attention_params: public Multihead_attention_params_base<T> {
// output cross attentions
float* cross_attention_out = nullptr;
int max_decoder_seq_len = 0;
bool is_return_cross_attentions = false;
// allows the attention to exit early
bool* finished = nullptr;
// required in case of cross attention
// will need it here until if constexpr (C++17) can be used
int* memory_length_per_sample = nullptr;
// required in case of masked attention with different length
const int* length_per_sample = nullptr;
};
template<typename T>
struct Multihead_attention_params<T, true>: public Multihead_attention_params_base<T> {
// output cross attentions
float* cross_attention_out = nullptr;
int max_decoder_seq_len = 0;
bool is_return_cross_attentions = false;
// allows the attention to exit early
bool* finished = nullptr;
// required in case of cross attention
int* memory_length_per_sample = nullptr;
// required in case of masked attention with different length
const int* length_per_sample = nullptr;
};
template<class T>
using Masked_multihead_attention_params = Multihead_attention_params<T, false>;
template<class T>
using Cross_multihead_attention_params = Multihead_attention_params<T, true>;
template<typename T>
struct outputCrossAttentionParam {
// max decoder output length
int max_decoder_seq_len = 0;
T* cross_attention_out = nullptr;
bool is_return_cross_attentions = false;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
void masked_multihead_attention(const Masked_multihead_attention_params<float>& params, const cudaStream_t& stream);
void masked_multihead_attention(const Masked_multihead_attention_params<uint16_t>& params, const cudaStream_t& stream);
#ifdef ENABLE_BF16
void masked_multihead_attention(const Masked_multihead_attention_params<__nv_bfloat16>& params,
const cudaStream_t& stream);
#endif
void cross_multihead_attention(const Cross_multihead_attention_params<float>& params, const cudaStream_t& stream);
void cross_multihead_attention(const Cross_multihead_attention_params<uint16_t>& params, const cudaStream_t& stream);
#ifdef ENABLE_BF16
void cross_multihead_attention(const Cross_multihead_attention_params<__nv_bfloat16>& params,
const cudaStream_t& stream);
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// Adapted from NVIDIA/FasterTransformer and FlashAttention
#include <torch/extension.h>
#include "ATen/cuda/CUDAContext.h"
#include <c10/cuda/CUDAGuard.h>
#include "ft_attention.h"
#include "decoder_masked_multihead_attention.h"
#define CHECK_DEVICE(x) TORCH_CHECK(x.device().type() == torch::kCUDA, #x " must be on CUDA")
#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define DISPATCH_FLOAT_AND_HALF_AND_BF16(TYPE, NAME, ...) \
if (TYPE == at::ScalarType::Half) { \
using scalar_t = at::Half; \
__VA_ARGS__(); \
} else if (TYPE == at::ScalarType::BFloat16) { \
using scalar_t = at::BFloat16; \
__VA_ARGS__(); \
} else if (TYPE == at::ScalarType::Float) { \
using scalar_t = float; \
__VA_ARGS__(); \
} else { \
AT_ERROR(#NAME, " not implemented for type '", toString(TYPE), "'"); \
}
template<typename T>
void masked_multihead_attention(const Masked_multihead_attention_params<T>& params,
const cudaStream_t& stream);
template<typename T>
void cross_multihead_attention(const Masked_multihead_attention_params<T>& params,
const cudaStream_t& stream);
template<typename T>
struct SATypeConverter {
using Type = T;
};
template<>
struct SATypeConverter<at::Half> {
using Type = uint16_t;
};
template<>
struct SATypeConverter<at::BFloat16> {
using Type = __nv_bfloat16;
};
template <typename T>
void set_params(Masked_multihead_attention_params<T> &params,
const size_t batch_size,
const size_t nheads,
const size_t nheads_kv,
const size_t memory_max_seqlen,
const size_t headdim,
const int timestep,
const int rotary_embedding_dim,
const float rotary_base,
const bool neox_rotary_style,
const int qkv_batch_stride,
T *q_ptr,
T *k_ptr,
T *v_ptr,
T *k_cache_ptr,
T *v_cache_ptr,
int *length_per_sample,
float *alibi_slopes_ptr,
T *out_ptr) {
// Reset the parameters
memset(&params, 0, sizeof(params));
params.q = q_ptr;
params.k = k_ptr;
params.v = v_ptr;
params.q_bias = nullptr;
params.k_bias = nullptr;
params.v_bias = nullptr;
params.k_cache = k_cache_ptr;
params.v_cache = v_cache_ptr;
params.linear_bias_slopes = alibi_slopes_ptr;
params.out = out_ptr;
params.cache_indir = nullptr;
params.stride = qkv_batch_stride;
params.batch_size = batch_size;
params.beam_width = 1;
params.memory_max_len = memory_max_seqlen;
params.num_heads = nheads;
params.num_kv_heads = nheads_kv;
params.hidden_size_per_head = headdim;
params.rotary_embedding_dim = rotary_embedding_dim;
params.rotary_base = rotary_base;
params.neox_rotary_style = neox_rotary_style;
params.timestep = timestep;
params.inv_sqrt_dh = 1.f / sqrt(float(headdim));
params.total_padding_tokens = nullptr;
params.masked_tokens = nullptr;
params.prefix_prompt_lengths = nullptr;
params.max_prefix_prompt_length = 0;
params.relative_attention_bias = nullptr;
params.relative_attention_bias_stride = 0;
params.cross_attention_out = nullptr;
params.max_decoder_seq_len = 0;
params.is_return_cross_attentions = false;
params.finished = nullptr;
params.memory_length_per_sample = nullptr;
params.length_per_sample = length_per_sample;
}
torch::Tensor single_query_attention(const torch::Tensor q,
const torch::Tensor k,
const torch::Tensor v,
torch::Tensor k_cache,
torch::Tensor v_cache,
c10::optional<const torch::Tensor> length_per_sample_,
c10::optional<const torch::Tensor> alibi_slopes_,
const int timestep,
const int rotary_embedding_dim,
const float rotary_base,
// neox_rotary_style = not interleaved
const bool neox_rotary_style) {
CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); CHECK_DEVICE(k_cache); CHECK_DEVICE(v_cache);
int batch_size = v_cache.size(0);
int nheads = q.size(1);
int nheads_kv = v_cache.size(1);
int memory_max_seqlen = v_cache.size(2);
int headdim = v_cache.size(3);
CHECK_SHAPE(q, batch_size, nheads, headdim);
CHECK_SHAPE(k, batch_size, nheads_kv, headdim);
CHECK_SHAPE(v, batch_size, nheads_kv, headdim);
CHECK_SHAPE(v_cache, batch_size, nheads_kv, memory_max_seqlen, headdim);
// k_cache shape: [B, H, Dh/x, L, x] where x=8 for fp16 and x=4 for fp32
int packsize = k_cache.dtype() == torch::kFloat32 ? 4 : 8;
CHECK_SHAPE(k_cache, batch_size, nheads_kv, headdim / packsize, memory_max_seqlen, packsize);
TORCH_CHECK(q.stride(2) == 1 && q.stride(1) == headdim);
TORCH_CHECK(k.stride(2) == 1 && k.stride(1) == headdim);
TORCH_CHECK(v.stride(2) == 1 && v.stride(1) == headdim);
// TORCH_CHECK(q.stride(0) == k.stride(0) && q.stride(0) == v.stride(0));
CHECK_CONTIGUOUS(v_cache); CHECK_CONTIGUOUS(k_cache);
if (length_per_sample_.has_value()) {
auto length_per_sample = length_per_sample_.value();
CHECK_DEVICE(length_per_sample);
CHECK_SHAPE(length_per_sample, batch_size);
CHECK_CONTIGUOUS(length_per_sample);
TORCH_CHECK(length_per_sample.dtype() == torch::kInt32);
}
if (alibi_slopes_.has_value()) {
auto alibi_slopes = alibi_slopes_.value();
CHECK_DEVICE(alibi_slopes);
CHECK_SHAPE(alibi_slopes, nheads);
CHECK_CONTIGUOUS(alibi_slopes);
TORCH_CHECK(alibi_slopes.dtype() == torch::kFloat32);
}
// Otherwise the kernel will be launched from cuda:0 device
// Cast to char to avoid compiler warning about narrowing
at::cuda::CUDAGuard device_guard{(char)q.get_device()};
torch::Tensor out = torch::empty_like(q);
DISPATCH_FLOAT_AND_HALF_AND_BF16(q.scalar_type(), "single_query_attention", [&] {
using DataType = typename SATypeConverter<scalar_t>::Type;
Masked_multihead_attention_params<DataType> params;
set_params(params, batch_size, nheads, nheads_kv, memory_max_seqlen, headdim,
timestep, rotary_embedding_dim, rotary_base, neox_rotary_style, q.stride(0),
reinterpret_cast<DataType*>(q.data_ptr()),
reinterpret_cast<DataType*>(k.data_ptr()),
reinterpret_cast<DataType*>(v.data_ptr()),
reinterpret_cast<DataType*>(k_cache.data_ptr()),
reinterpret_cast<DataType*>(v_cache.data_ptr()),
length_per_sample_.has_value()
? length_per_sample_.value().data_ptr<int>() : nullptr,
alibi_slopes_.has_value()
? alibi_slopes_.value().data_ptr<float>(): nullptr,
reinterpret_cast<DataType*>(out.data_ptr()));
auto stream = at::cuda::getCurrentCUDAStream();
masked_multihead_attention(params, stream);
});
return out;
}
\ No newline at end of file
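For reference, the k_cache layout checked above ([B, H, Dh/x, L, x], with x = 8 for fp16 and x = 4 for fp32) can be produced from a plain [B, H, L, Dh] cache as sketched below; this is an illustrative PyTorch snippet, not part of the extension:

```python
import torch

def pack_k_cache(k_bhld: torch.Tensor) -> torch.Tensor:
    """Repack a [B, H, L, Dh] key cache into the [B, H, Dh/x, L, x] layout."""
    B, H, L, Dh = k_bhld.shape
    x = 4 if k_bhld.dtype == torch.float32 else 8
    assert Dh % x == 0
    # Group every x consecutive head-dim elements together, then move the group index
    # ahead of the sequence dimension so each token contributes x contiguous values.
    return k_bhld.reshape(B, H, L, Dh // x, x).permute(0, 1, 3, 2, 4).contiguous()
```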
#pragma once
#include <torch/extension.h>
torch::Tensor single_query_attention(const torch::Tensor q,
const torch::Tensor k,
const torch::Tensor v,
torch::Tensor k_cache,
torch::Tensor v_cache,
c10::optional<const torch::Tensor> length_per_sample_,
c10::optional<const torch::Tensor> alibi_slopes_,
const int timestep,
const int rotary_embedding_dim = 0,
const float rotary_base = 10000.0f,
const bool neox_rotary_style=true);
\ No newline at end of file
/*
Adapted from NVIDIA FasterTransformer:
https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/kernels/layernorm_kernels.cu
*/
#include <torch/extension.h>
#include <cuda_fp16.h>
#include "reduction.cuh"
#include "layernorm.h"
#include <cuda_runtime.h>
#include <c10/cuda/CUDAGuard.h>
static inline __device__ float to_float(half src)
{
return __half2float(src);
}
static inline __device__ float to_float(float src)
{
return src;
}
template<typename T>
__global__ void generalT5LayerNorm(
const T* __restrict input, const T* __restrict gamma, T* output, const float layernorm_eps, int m, int n)
{
// LayerNorm module in the T5 style: no bias and no subtraction of mean.
const int tid = threadIdx.x;
__shared__ float s_variance;
float variance = 0.0f;
float local_var_sum = 0.0f;
for (int i = tid; i < n; i += blockDim.x) {
float diff = to_float(__ldg(&input[blockIdx.x * n + i]));
local_var_sum += diff * diff;
}
variance = blockReduceSum(local_var_sum);
if (threadIdx.x == 0) {
s_variance = rsqrtf(variance / (float)n + layernorm_eps);
}
__syncthreads();
for (int i = tid; i < n; i += blockDim.x) {
output[blockIdx.x * n + i] =
clamp_inf_for_half<T>((to_float(input[blockIdx.x * n + i]) * s_variance) * to_float(__ldg(&gamma[i])));
}
}
template<typename T>
void invokeGeneralT5LayerNorm(T* out,
const T* input,
const T* gamma,
// const T* beta,
const float layernorm_eps,
const int m,
const int n)
{
dim3 grid(m);
dim3 block(min(n, 1024));
/* For general cases, n is equal to hidden_units, e.g., 512/1024.
Since we have warp shuffle inside the code, block.x % 32 should be 0.
*/
if (n % 32 != 0) {
block.x = 1024;
}
block.x = block.x / (4 / sizeof(T)); // if using half, only need half of block.x
/* should pay attention to the rsqrt precision*/
generalT5LayerNorm<T><<<grid, block>>>(input, gamma, out, layernorm_eps, m, n); // For gpt-3
}
template void invokeGeneralT5LayerNorm(half* out,
const half* input,
const half* gamma,
// const half* beta,
const float layernorm_eps,
const int m,
const int n);
template void invokeGeneralT5LayerNorm(float* out,
const float* input,
const float* gamma,
// const half* beta,
const float layernorm_eps,
const int m,
const int n);
// input b, n, c
void layernorm_forward_cuda(
torch::Tensor _input,
torch::Tensor _gamma,
torch::Tensor _out,
float eps)
{
int m = _input.size(0) * _input.size(1);
int n = _input.size(2);
const at::cuda::OptionalCUDAGuard device_guard(device_of(_input));
auto input = reinterpret_cast<half*>(_input.data_ptr<at::Half>());
auto gamma = reinterpret_cast<half*>(_gamma.data_ptr<at::Half>());
auto out = reinterpret_cast<half*>(_out.data_ptr<at::Half>());
invokeGeneralT5LayerNorm(out, input, gamma, eps, m, n);
}
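The generalT5LayerNorm kernel above is T5-style layer norm (RMSNorm): it normalizes by the root mean square of the inputs and applies gamma, with no bias and no mean subtraction. A PyTorch reference for checking it might look like this (a sketch, not part of this file):

```python
import torch

def t5_layernorm_ref(x: torch.Tensor, gamma: torch.Tensor, eps: float) -> torch.Tensor:
    # Accumulate the mean of squares in fp32, matching the kernel's float accumulation.
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    return (x.float() * torch.rsqrt(variance + eps) * gamma.float()).to(x.dtype)
```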
#include <torch/extension.h>
void layernorm_forward_cuda(torch::Tensor _input, torch::Tensor _gamma, torch::Tensor _out, float eps);
/*
Adapted from NVIDIA FasterTransformer:
https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/kernels/reduce_kernel_utils.cuh
*/
#pragma once
#include <assert.h>
#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))
#include <cooperative_groups/reduce.h>
#else
#include <cooperative_groups.h>
#endif
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <float.h>
#include <type_traits>
#define HALF_FLT_MAX 65504.F
#define FINAL_MASK 0xffffffff
template<typename T>
inline __device__ T add(T a, T b) {
return a + b;
}
template<>
inline __device__ half2 add(half2 a, half2 b) {
return __hadd2(a, b);
}
template<>
inline __device__ half add(half a, half b) {
return __hadd(a, b);
}
template<typename T>
__inline__ __device__ T warpReduceSum(T val)
{
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1)
val = add(val, __shfl_xor_sync(FINAL_MASK, val, mask, 32)); // __shfl_sync on bf16 returns float when sm < 80
return val;
}
/* Calculate the sum of all elements in a block */
template<typename T>
__inline__ __device__ T blockReduceSum(T val)
{
static __shared__ T shared[32];
int lane = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;
val = warpReduceSum<T>(val);
if (lane == 0)
shared[wid] = val;
__syncthreads();
// Use blockDim.x / 32.f instead of a shift so the code still works when
// blockDim.x is not divisible by 32.
val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f);
val = warpReduceSum<T>(val);
return val;
}
template<typename T>
__device__ __forceinline__ T clamp_inf_for_half(const float input)
{
return input;
}
template<>
__device__ __forceinline__ half clamp_inf_for_half(const float input)
{
// clamp inf values to enable fp16 training
return input > 0.0f ? __float2half(min(input, HALF_FLT_MAX - 1000)) : __float2half(max(input, -HALF_FLT_MAX + 1000));
}
#pragma once
#include <torch/extension.h>
void rotary_embedding_neox(
torch::Tensor& positions,
torch::Tensor& query,
torch::Tensor& key,
int head_size,
torch::Tensor& cos_sin_cache);
\ No newline at end of file
/*
Adapted from the VLLM project:
https://github.com/vllm-project/vllm/blob/main/csrc/pos_encoding_kernels.cu
*/
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include "pos_encoding.h"
template<typename scalar_t>
__global__ void rotary_embedding_neox_kernel(
const int64_t* __restrict__ positions, // [num_tokens]
scalar_t* __restrict__ query, // [num_tokens, num_heads, head_size]
scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2]
const int rot_dim,
const int stride,
const int num_heads,
const int head_size) {
// Each thread block is responsible for one token.
const int token_idx = blockIdx.x;
int64_t pos = positions[token_idx];
const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
const int embed_dim = rot_dim / 2;
const int n = num_heads * embed_dim;
for (int i = threadIdx.x; i < n; i += blockDim.x) {
const int head_idx = i / embed_dim;
const int token_head = token_idx * stride + head_idx * head_size;
const int rot_offset = i % embed_dim;
const int x_index = rot_offset;
const int y_index = embed_dim + rot_offset;
const int out_x = token_idx * stride + head_idx * head_size + x_index;
const int out_y = token_idx * stride + head_idx * head_size + y_index;
const scalar_t cos = __ldg(cache_ptr + x_index);
const scalar_t sin = __ldg(cache_ptr + y_index);
const scalar_t q_x = query[token_head + x_index];
const scalar_t q_y = query[token_head + y_index];
query[out_x] = q_x * cos - q_y * sin;
query[out_y] = q_y * cos + q_x * sin;
const scalar_t k_x = key[token_head + x_index];
const scalar_t k_y = key[token_head + y_index];
key[out_x] = k_x * cos - k_y * sin;
key[out_y] = k_y * cos + k_x * sin;
}
}
void rotary_embedding_neox(
torch::Tensor& positions, // [b, num_tokens]
torch::Tensor& query, // [b, num_tokens, 1, num_heads, head_size]
torch::Tensor& key, // [b, num_tokens, 1, num_heads, head_size]
int head_size,
torch::Tensor& cos_sin_cache) // [max_position, rot_dim]
{
int num_tokens = query.size(0) * query.size(1);
int rot_dim = cos_sin_cache.size(1);
int num_heads = query.size(-2);
int stride = num_heads * head_size;
// TORCH_CHECK(stride == key.stride(0));
dim3 grid(num_tokens);
dim3 block(std::min(num_heads * rot_dim / 2, 512));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES_AND2(
at::ScalarType::Half,
at::ScalarType::BFloat16,
query.scalar_type(),
"rotary_embedding_neox",
[&] {
rotary_embedding_neox_kernel<scalar_t><<<grid, block, 0, stream>>>(
positions.data_ptr<int64_t>(),
query.data_ptr<scalar_t>(),
key.data_ptr<scalar_t>(),
cos_sin_cache.data_ptr<scalar_t>(),
rot_dim,
stride,
num_heads,
head_size);
});
}
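For reference, the same GPT-NeoX-style (non-interleaved) rotation expressed in PyTorch; a sketch for sanity-checking the kernel, assuming each row of cos_sin_cache stores the rot_dim/2 cosines followed by the rot_dim/2 sines, as the x_index/y_index reads above imply:

```python
import torch

def rotary_neox_ref(x: torch.Tensor, positions: torch.Tensor,
                    cos_sin_cache: torch.Tensor) -> torch.Tensor:
    # x: [num_tokens, num_heads, head_size]; only the first rot_dim channels are rotated.
    rot_dim = cos_sin_cache.size(-1)
    half = rot_dim // 2
    cos, sin = cos_sin_cache[positions].split(half, dim=-1)   # each [num_tokens, half]
    cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)             # broadcast over heads
    x1, x2 = x[..., :half], x[..., half:rot_dim]
    out = x.clone()
    out[..., :half] = x1 * cos - x2 * sin
    out[..., half:rot_dim] = x2 * cos + x1 * sin
    return out
```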
#include <pybind11/pybind11.h>
#include <torch/extension.h>
#include "layernorm/layernorm.h"
#include "quantization/gemm_cuda.h"
#include "quantization/gemv_cuda.h"
#include "position_embedding/pos_encoding.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("layernorm_forward_cuda", &layernorm_forward_cuda, "FasterTransformer layernorm kernel");
m.def("gemm_forward_cuda", &gemm_forward_cuda, "Quantized GEMM kernel.");
m.def("gemmv2_forward_cuda", &gemmv2_forward_cuda, "Quantized v2 GEMM kernel.");
m.def("gemv_forward_cuda", &gemv_forward_cuda, "Quantized GEMV kernel.");
m.def("rotary_embedding_neox", &rotary_embedding_neox, "Apply GPT-NeoX style rotary embedding to query and key");
}
\ No newline at end of file
#include <pybind11/pybind11.h>
#include <torch/extension.h>
#include "attention/ft_attention.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("single_query_attention", &single_query_attention, "Attention with a single query",
py::arg("q"), py::arg("k"), py::arg("v"), py::arg("k_cache"), py::arg("v_cache"),
py::arg("length_per_sample_"), py::arg("alibi_slopes_"), py::arg("timestep"), py::arg("rotary_embedding_dim")=0,
py::arg("rotary_base")=10000.0f, py::arg("neox_rotary_style")=true);
}
\ No newline at end of file
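A hedged usage sketch from Python. The module name below (ft_attention) is hypothetical and depends on the name passed to the extension in setup.py; the tensor shapes follow the checks in single_query_attention:

```python
import torch
import ft_attention  # hypothetical import name for the extension built from this module

B, H, H_kv, L, Dh, t = 2, 32, 32, 2048, 128, 17   # batch, q heads, kv heads, max len, head dim, timestep
x = 8                                             # fp16/bf16 pack size for the key cache layout
dev, dtype = "cuda", torch.float16

q = torch.randn(B, H, Dh, device=dev, dtype=dtype)
k = torch.randn(B, H_kv, Dh, device=dev, dtype=dtype)
v = torch.randn(B, H_kv, Dh, device=dev, dtype=dtype)
k_cache = torch.zeros(B, H_kv, Dh // x, L, x, device=dev, dtype=dtype)
v_cache = torch.zeros(B, H_kv, L, Dh, device=dev, dtype=dtype)

# length_per_sample_ and alibi_slopes_ are optional; rotary arguments keep their defaults.
out = ft_attention.single_query_attention(q, k, v, k_cache, v_cache, None, None, t)
```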
/*
Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
@article{lin2023awq,
title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration},
author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song},
journal={arXiv},
year={2023}
}
*/
#pragma once
__device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source)
{
uint4 result;
uint32_t* h = reinterpret_cast<uint32_t*>(&result);
uint32_t const i4s = reinterpret_cast<uint32_t const&>(source);
// First, we extract the i4s and construct an intermediate fp16 number.
static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;
static constexpr uint32_t BOTTOM_MASK = 0x000f000f;
static constexpr uint32_t TOP_MASK = 0x00f000f0;
static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400;
// Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing
// format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions.
// In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and
// elt_67 to fp16 without having to shift them to the bottom bits beforehand.
// Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW dependency if we issue
// immediately before required.
const uint32_t top_i4s = i4s >> 8;
// Extract elt_01 - (i4s & 0x000f000f) | 0x64006400
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
: "=r"(h[0])
: "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));
// Extract elt_23 (i4s & 0x00f000f0) | 0x64006400
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
: "=r"(h[1])
: "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));
// Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
: "=r"(h[2])
: "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));
// Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400
asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
: "=r"(h[3])
: "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut));
// I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the
// half2 ctor. In this case, I chose performance reliability over code readability.
// This is the half2 {1032, 1032} represented as an integer.
// static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408;
// Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7]
static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400;
// This is the half2 {1 / 16, 1 / 16} represented as an integer.
static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00;
// This is the half2 {-72, -72} represented as an integer.
// static constexpr uint32_t NEG_72 = 0xd480d480;
// Haotian: Let's use {-64, -64}.
static constexpr uint32_t NEG_64 = 0xd400d400;
// Finally, we construct the output numbers.
// Convert elt_01
asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM));
// Convert elt_23
asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64));
// Convert elt_45
asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM));
// Convert elt_67
asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64));
return result;
}
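The magic numbers rely on the fp16 bit layout: 0x6400 is the half value 1024, whose unit in the last place is 1, so OR-ing a bottom nibble n into the mantissa yields exactly 1024 + n (subtracting 1024 recovers n), while a top nibble lands four bits higher and yields 1024 + 16*n (hence the fma with 1/16 and -64). A small numpy check of both paths (an illustrative sketch, not part of this header):

```python
import numpy as np

def as_fp16(bits: int) -> float:
    """Interpret a 16-bit pattern as an IEEE fp16 value."""
    return float(np.array([bits], dtype=np.uint16).view(np.float16)[0])

for n in range(16):
    # Bottom-nibble path: (i4s & 0x000f) | 0x6400 == fp16(1024 + n); sub.f16x2 removes the 1024.
    assert as_fp16(0x6400 | n) - 1024.0 == n
    # Top-nibble path: (i4s & 0x00f0) | 0x6400 == fp16(1024 + 16 * n); fma(x, 1/16, -64) recovers n.
    assert as_fp16(0x6400 | (n << 4)) / 16.0 - 64.0 == n
```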
#include <torch/extension.h>
torch::Tensor gemm_forward_cuda(torch::Tensor _in_feats, torch::Tensor _kernel,
torch::Tensor _scaling_factors, torch::Tensor _zeros, int split_k_iters);
torch::Tensor gemmv2_forward_cuda(torch::Tensor _in_feats, torch::Tensor _kernel,
torch::Tensor _scaling_factors, torch::Tensor _zeros, int group_size, int split_k_iters);
\ No newline at end of file