Merge branch 'v0.0.6_develop_sugon' into 'main'

v0.0.6 See merge request dcutoolkit/deeplearing/autoawq_kernels!1

Merge branch 'v0.0.6_develop_sugon' into 'main'
v0.0.6 See merge request dcutoolkit/deeplearing/autoawq_kernels!1
9f217825 · gaoqiong · b2c05ad6 · 1c46b800 · 9f217825 · 9f217825
Commit 9f217825 authored Jul 03, 2024 by gaoqiong
20 changed files
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
+# Runs only when a tag whose name starts with "v" is pushed.
+name: Build AutoAWQ Wheels with CUDA
+on:
+  push:
+    tags:
+      - "v*"
+jobs:
+  release:
+    # Creates the GitHub release for the pushed tag and exposes its upload URL
+    # as a job output so that dependent jobs can attach assets to it.
+    name: Create Release
+    runs-on: ubuntu-latest
+    outputs:
+      upload_url: ${{ steps.create_release.outputs.upload_url }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Extract branch info
+        shell: bash
+        run: |
+          # Strip the leading "refs/<kind>/" from GITHUB_REF (refs/tags/v1.2.3 -> v1.2.3)
+          # and export it to later steps as env.release_tag.
+          echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
+      - name: Create Release
+        id: create_release
+        uses: "actions/github-script@v6"
+        env:
+          # Read by github_create_release.js through process.env.RELEASE_TAG.
+          RELEASE_TAG: ${{ env.release_tag }}
+        with:
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          script: |
+            const script = require('.github/workflows/scripts/github_create_release.js')
+            await script(github, context, core)
+  build_cuda_wheels:
+    # Builds one wheel per (os, Python, CUDA) combination and uploads it to the
+    # release created by the `release` job.
+    name: Build AWQ with CUDA
+    runs-on: ${{ matrix.os }}
+    needs: release
+    strategy:
+      # The release already exists when this job starts, so keep building the
+      # remaining wheels if a single matrix combination fails instead of
+      # cancelling all of them.
+      fail-fast: false
+      matrix:
+        os: [ubuntu-20.04, windows-latest]
+        pyver: ["3.8", "3.9", "3.10", "3.11"]
+        cuda: ["11.8.0", "12.1.1"]
+    defaults:
+      run:
+        # PowerShell is used on every OS so the same scripts run on Linux and Windows.
+        shell: pwsh
+    env:
+      # CUDA version for which the Build Wheel step sets PYPI_BUILD.
+      PYPI_CUDA_VERSION: "12.1.1"
+      CUDA_VERSION: ${{ matrix.cuda }}
+    steps:
+      - name: Free Disk Space
+        uses: jlumbroso/free-disk-space@v1.3.0
+        if: runner.os == 'Linux'
+        with:
+          tool-cache: false
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: false
+          docker-images: true
+          swap-storage: false
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.pyver }}
+      - name: Setup Mamba
+        uses: conda-incubator/setup-miniconda@v2.2.0
+        with:
+          activate-environment: "build"
+          python-version: ${{ matrix.pyver }}
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          use-mamba: true
+          add-pip-as-python-dependency: true
+          auto-activate-base: false
+      - name: Install Dependencies
+        run: |
+          # Install CUDA toolkit
+          mamba install -y 'cuda' -c "nvidia/label/cuda-${env:CUDA_VERSION}"
+          # Env variables
+          $env:CUDA_PATH = $env:CONDA_PREFIX
+          $env:CUDA_HOME = $env:CONDA_PREFIX
+          # Install torch
+          # "12.1.1" -> "1211" -> "121": drop the dots, then the trailing patch digit,
+          # to obtain the suffix used in the PyTorch wheel index (cu118 / cu121).
+          $cudaVersion = $env:CUDA_VERSION.Replace('.', '')
+          $cudaVersionPytorch = $cudaVersion.Substring(0, $cudaVersion.Length - 1)
+          $pytorchVersion = "torch==2.2.1"
+          python -m pip install --upgrade --no-cache-dir $pytorchVersion+cu$cudaVersionPytorch --index-url https://download.pytorch.org/whl/cu$cudaVersionPytorch
+          python -m pip install build setuptools wheel ninja
+          # Print version information
+          python --version
+          python -c "import torch; print('PyTorch:', torch.__version__)"
+          python -c "import torch; print('CUDA:', torch.version.cuda)"
+          python -c "import os; print('CUDA_HOME:', os.getenv('CUDA_HOME', None))"
+          python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
+      - name: Build Wheel
+        run: |
+          # Variables assigned with $env: in the previous step do not carry over,
+          # so point the build at the conda-provided CUDA toolkit again.
+          $env:CUDA_PATH = $env:CONDA_PREFIX
+          $env:CUDA_HOME = $env:CONDA_PREFIX
+          # Only add +cu118 to wheel if not releasing on PyPi
+          if ( $env:CUDA_VERSION -eq $env:PYPI_CUDA_VERSION ){
+            $env:PYPI_BUILD = 1
+          }
+          python setup.py sdist bdist_wheel
+      - name: Upload Assets
+        uses: shogo82148/actions-upload-release-asset@v1
+        with:
+          upload_url: ${{ needs.release.outputs.upload_url }}
+          # Only the wheels are uploaded; the sdist built above stays in ./dist.
+          asset_path: ./dist/*.whl
+  build_rocm_wheels:
+    # Builds one wheel per (Python, ROCm) combination and uploads it to the
+    # release created by the `release` job.
+    name: Build AWQ with ROCm
+    runs-on: ${{ matrix.os }}
+    needs: release
+    strategy:
+      # The release already exists when this job starts, so keep building the
+      # remaining wheels if a single matrix combination fails instead of
+      # cancelling all of them.
+      fail-fast: false
+      matrix:
+        os: [ubuntu-20.04]
+        python: ["3.8", "3.9", "3.10", "3.11"]
+        # Every version listed here needs a matching branch in both the
+        # "Set up ROCm" and "Install Dependencies" steps (torch==2.2.0 from the
+        # rocm5.6 / rocm5.7 wheel indexes).
+        rocm: ["5.6.1", "5.7.1"]
+    defaults:
+      run:
+        shell: bash
+    env:
+      ROCM_VERSION: ${{ matrix.rocm }}
+    steps:
+      - uses: actions/checkout@v3
+      - name: Free Disk Space
+        run: |
+          df -h
+          echo "Removing large packages"
+          sudo apt-get remove -y '^dotnet-.*'
+          sudo apt-get remove -y 'php.*'
+          sudo apt-get remove -y azure-cli google-chrome-stable firefox powershell mono-devel
+          df -h
+          sudo apt-get autoremove -y >/dev/null 2>&1
+          sudo apt-get clean
+          sudo apt-get autoremove -y >/dev/null 2>&1
+          sudo apt-get autoclean -y >/dev/null 2>&1
+          df -h
+          echo "https://github.com/actions/virtual-environments/issues/709"
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+          df -h
+          echo "remove big /usr/local"
+          sudo rm -rf "/usr/local/share/boost"
+          sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
+          df -h
+          sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1
+          sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1
+          sudo rm -rf /usr/share/swift > /dev/null 2>&1
+          df -h
+      - uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python }}
+      - name: Setup Mamba
+        uses: conda-incubator/setup-miniconda@v2.2.0
+        with:
+          activate-environment: "build"
+          python-version: ${{ matrix.python }}
+          mamba-version: "*"
+          use-mamba: false
+          channels: conda-forge,defaults
+          channel-priority: true
+          add-pip-as-python-dependency: true
+          auto-activate-base: false
+      - name: Set up ROCm
+        run: |
+          echo "Using python:"
+          python --version
+          which python
+          # Map the matrix ROCm version to the matching amdgpu-install package name.
+          if [[ "${{ matrix.rocm }}" == "5.6.1" ]]; then
+            export ROCM_DL_FILE=amdgpu-install_5.6.50601-1_all.deb
+          elif [[ "${{ matrix.rocm }}" == "5.7.1" ]]; then
+            export ROCM_DL_FILE=amdgpu-install_5.7.50701-1_all.deb
+          else
+            echo Unknown rocm version
+            exit 1
+          fi
+          # --fail makes an HTTP error stop the step here instead of saving the
+          # error page as the .deb and failing later inside dpkg.
+          curl -fL --retry 3 -O "https://repo.radeon.com/amdgpu-install/${{ matrix.rocm }}/ubuntu/focal/$ROCM_DL_FILE"
+          sudo dpkg -i "$ROCM_DL_FILE"
+          sudo DEBIAN_FRONTEND=noninteractive amdgpu-install --usecase=rocm --no-dkms --no-32 -y
+      - name: Install Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends rocsparse-dev rocthrust-dev rocblas-dev hipblas-dev hipsparse-dev
+          python -m pip install --upgrade build setuptools wheel
+          if [[ "${{ matrix.rocm }}" == "5.7.1" ]]; then
+            python -m pip install torch==2.2.0 --index-url https://download.pytorch.org/whl/rocm5.7
+          elif [[ "${{ matrix.rocm }}" == "5.6.1" ]]; then
+            python -m pip install torch==2.2.0 --index-url https://download.pytorch.org/whl/rocm5.6
+          else
+            echo Unknown rocm version for python install
+            exit 1
+          fi
+      - name: Build Wheel
+        run: |
+          echo "Using python for build:"
+          python --version
+          which python
+          ROCM_VERSION=${{ matrix.rocm }} python setup.py sdist bdist_wheel
+      - name: Upload Assets
+        uses: shogo82148/actions-upload-release-asset@v1
+        with:
+          upload_url: ${{ needs.release.outputs.upload_url }}
+          # Only the wheels are uploaded; the sdist built above stays in ./dist.
+          asset_path: ./dist/*.whl
--- a/.github/workflows/scripts/github_create_release.js
+++ b/.github/workflows/scripts/github_create_release.js
+module.exports = async (github, context, core) => {
+	try {
+		const response = await github.rest.repos.createRelease({
+			draft: false,
+			generate_release_notes: true,
+			name: process.env.RELEASE_TAG,
+			owner: context.repo.owner,
+			prerelease: false,
+			repo: context.repo.repo,
+			tag_name: process.env.RELEASE_TAG,
+		});
+		core.setOutput('upload_url', response.data.upload_url);
+	} catch (error) {
+		core.setFailed(error.message);
+	}
+}
\ No newline at end of file
--- a/.gitignore
+++ b/.gitignore
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+*hip*
+!hip_compact.hip
--- a/LICENSE
+++ b/LICENSE
+MIT License
+Copyright (c) 2023 Casper
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
-# AutoAWQ_kernels
+# <div align="center"><strong>AutoAWQ_kernel</strong></div>
+## 简介
+AutoAWQ_kernel是从AutoAWQ中分离出来的一个组件，用于减少编译时间
+## 安装
+### 使用源码编译方式安装
+#### 编译环境准备
+下载光源的镜像，启动docker容器
+```
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
+# <Image ID>用上面拉取docker镜像的ID替换
+# <Host Path>主机端路径
+# <Container Path>容器映射路径
+docker run -it --name baichuan --shm-size=1024G -v /opt/hyhal:/opt/hyhal:ro --device=/dev/kfd --device=/dev/dri/ --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --ulimit memlock=-1:-1 --ipc=host --network host --group-add video -v <Host Path>:<Container Path> <Image ID> /bin/bash
+```
+注：
+1、docker启动时，-v /opt/hyhal:/opt/hyhal 这个挂载参数不能省略
+#### 源码编译安装
+- 代码下载
+根据不同的需求下载不同的分支
+- 提供2种源码编译方式（进入AutoAWQ目录）：
+```
+1. 源码编译安装
+pip3 install -e .
+2. 编译成whl包安装
+# 安装wheel 
+python3 setup.py bdist_wheel
+cd dist && pip3 install autoawq*
+```
--- a/README_origin.md
+++ b/README_origin.md
+# AutoAWQ Kernels
+AutoAWQ Kernels is a new package that is split up from the [main repository](https://github.com/casper-hansen/AutoAWQ) in order to avoid compilation times.
+## Requirements
+- Windows: Must use WSL2.
+- NVIDIA:
+  - GPU: Must be compute capability 7.5 or higher.
+  - CUDA Toolkit: Must be 11.8 or higher.
+- AMD:
+  - ROCm: Must be 5.6 or higher.
+## Install
+### Install from PyPi
+The package is available on PyPi with CUDA 12.1.1 wheels:
+```
+pip install autoawq-kernels
+```
+### Install release wheels
+For ROCm and other CUDA versions, you can use the wheels published at each [release](https://github.com/casper-hansen/AutoAWQ_kernels/releases/):
+```
+pip install https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.2/autoawq_kernels-0.0.2+rocm561-cp310-cp310-linux_x86_64.whl
+```
+### Build from source
+You can also build from source:
+```
+git clone https://github.com/casper-hansen/AutoAWQ_kernels
+cd AutoAWQ_kernels
+pip install -e .
+```
+To build for ROCm, you need to first install the following packages `rocsparse-dev hipsparse-dev rocthrust-dev rocblas-dev hipblas-dev`.
\ No newline at end of file
--- a/awq_ext/attention/cuda_bf16_fallbacks.cuh
+++ b/awq_ext/attention/cuda_bf16_fallbacks.cuh
+// Downloaded from FasterTransformer v5.2.1
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/utils/cuda_bf16_fallbacks.cuh
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "cuda_bf16_wrapper.h"
+#include <cuda_fp16.h>
+namespace fastertransformer {
+#ifdef ENABLE_BF16
+// Portable bfloat16 helpers.
+//
+// Every function below has two compile-time variants selected on __CUDA_ARCH__:
+//   * __CUDA_ARCH__ defined and < 800: the operation is carried out in float
+//     (or the pair is assembled field by field) and converted back to bf16;
+//   * otherwise: the bf16 intrinsic / operator is used directly.
+// The "2" suffix denotes functions working on a packed pair (__nv_bfloat162),
+// applied independently to the low and high element.
+
+// Converts a packed bf16 pair to a float2 (x = low element, y = high element).
+inline __device__ float2 bf1622float2(const __nv_bfloat162 val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    float2 f_val;
+    f_val.x = __low2float(val);
+    f_val.y = __high2float(val);
+    return f_val;
+#else
+    return __bfloat1622float2(val);
+#endif
+}
+// Clamps both elements to [-128, 127], converts each to int8_t (through short)
+// and returns the two bytes packed in an int16_t: int8[0] holds the low (x)
+// element and int8[1] the high (y) element.
+inline __device__ int16_t bf1622int16(__nv_bfloat162 val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    float2 f_val;
+    f_val.x = max(min(__low2float(val), 127.f), -128.f);
+    f_val.y = max(min(__high2float(val), 127.f), -128.f);
+    union { int8_t int8[2]; int16_t int16; };
+    int8[0] = static_cast<int8_t>(static_cast<short>(f_val.x));
+    int8[1] = static_cast<int8_t>(static_cast<short>(f_val.y));
+    return int16;
+#else
+    val = __hmin2(val, make_bfloat162(127., 127.));
+    val = __hmax2(val, make_bfloat162(-128., -128.));
+    union { int8_t int8[2]; int16_t int16; };
+    int8[0] = static_cast<int8_t>(static_cast<short>(val.x));
+    int8[1] = static_cast<int8_t>(static_cast<short>(val.y));
+    return int16;
+#endif
+}
+// Converts a float2 to a packed bf16 pair using the round-to-nearest (_rn) intrinsics.
+inline __device__ __nv_bfloat162 float22bf162(const float2 val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    return __floats2bfloat162_rn(val.x, val.y);
+#else
+    return __float22bfloat162_rn(val);
+#endif
+}
+// Broadcasts a single bf16 value into both elements of a packed pair.
+inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    __nv_bfloat162 val2;
+    val2.x = val;
+    val2.y = val;
+    return val2;
+#else
+    return __bfloat162bfloat162(val);
+#endif
+}
+// Element-wise x + y on packed pairs.
+inline __device__ __nv_bfloat162 bf16hadd2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    float fxl, fxh, fyl, fyh;
+    fxl = __low2float(x);
+    fxh = __high2float(x);
+    fyl = __low2float(y);
+    fyh = __high2float(y);
+    return __floats2bfloat162_rn(fxl + fyl, fxh + fyh);
+#else
+    return __hadd2(x, y);
+#endif
+}
+// Scalar x + y.
+inline __device__ __nv_bfloat16 bf16hadd(const __nv_bfloat16 x, const __nv_bfloat16 y) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    return __float2bfloat16( __bfloat162float(x) + __bfloat162float(y) );
+#else
+    return __hadd(x, y);
+#endif
+}
+// Element-wise x - y on packed pairs.
+inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    float fxl, fxh, fyl, fyh;
+    fxl = __low2float(x);
+    fxh = __high2float(x);
+    fyl = __low2float(y);
+    fyh = __high2float(y);
+    return __floats2bfloat162_rn(fxl - fyl, fxh - fyh);
+#else
+    return __hsub2(x, y);
+#endif
+}
+// Scalar x - y.
+inline __device__ __nv_bfloat16 bf16hsub(const __nv_bfloat16 x, const __nv_bfloat16 y) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    return __float2bfloat16( __bfloat162float(x) - __bfloat162float(y) );
+#else
+    return __hsub(x, y);
+#endif
+}
+// Element-wise x * y on packed pairs.
+inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    float fxl, fxh, fyl, fyh;
+    fxl = __low2float(x);
+    fxh = __high2float(x);
+    fyl = __low2float(y);
+    fyh = __high2float(y);
+    return __floats2bfloat162_rn(fxl * fyl, fxh * fyh);
+#else
+    return __hmul2(x, y);
+#endif
+}
+// Scalar x * y.
+inline __device__ __nv_bfloat16 bf16hmul(const __nv_bfloat16 x, const __nv_bfloat16 y) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) );
+#else
+    return __hmul(x, y);
+#endif
+}
+// Element-wise x * y + z on packed pairs.
+inline __device__ __nv_bfloat162 bf16hfma2(const __nv_bfloat162 x, const __nv_bfloat162 y, const __nv_bfloat162 z) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    float fxl, fxh, fyl, fyh, fzl, fzh;
+    fxl = __low2float(x);
+    fxh = __high2float(x);
+    fyl = __low2float(y);
+    fyh = __high2float(y);
+    fzl = __low2float(z);
+    fzh = __high2float(z);
+    return __floats2bfloat162_rn(fxl * fyl + fzl, fxh * fyh + fzh);
+#else
+    return __hfma2(x, y, z);
+#endif
+}
+// Scalar x * y + z.
+inline __device__ __nv_bfloat16 bf16hfma(const __nv_bfloat16 x, const __nv_bfloat16 y, const __nv_bfloat16 z) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) + __bfloat162float(z));
+#else
+    return __hfma(x, y, z);
+#endif
+}
+// Element-wise natural exponential (expf / h2exp) of a packed pair.
+// The trailing "2" refers to the pair width, not to a base-2 exponential.
+inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    float fxl, fxh;
+    fxl = __low2float(x);
+    fxh = __high2float(x);
+    return __floats2bfloat162_rn(expf(fxl), expf(fxh));
+#else
+    return h2exp(x);
+#endif
+}
+// Operators and constructor helper that are declared here only when compiling
+// device code for __CUDA_ARCH__ < 800; they forward to the fallbacks above.
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
+inline __device__ __nv_bfloat162 operator*(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hmul2(x, y); }
+inline __device__ __nv_bfloat162 operator+(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hadd2(x, y); }
+inline __device__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y)
+{
+    __nv_bfloat162 t; t.x = x; t.y = y; return t;
+}
+#endif
+// Scalar a + b + c.
+inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c));
+#else
+    return a + b + c;
+#endif
+}
+// Scalar a + b + c + d; both variants accumulate in float and convert once at the end.
+inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c) + __bfloat162float(d));
+#else
+    return (__nv_bfloat16)((float)a + (float)b + (float)c + (float)d);
+#endif
+}
+// Element-wise a + b + c on packed pairs.
+inline __device__ __nv_bfloat162 bf16hadd2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    float fal, fah, fbl, fbh, fcl, fch;
+    fal = __low2float(a);
+    fah = __high2float(a);
+    fbl = __low2float(b);
+    fbh = __high2float(b);
+    fcl = __low2float(c);
+    fch = __high2float(c);
+    return __floats2bfloat162_rn(fal + fbl + fcl, fah + fbh + fch);
+#else
+    return a + b + c;
+#endif
+}
+// Scalar a * b * c.
+inline __device__ __nv_bfloat16 bf16hmul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b) * __bfloat162float(c));
+#else
+    return a * b * c;
+#endif
+}
+// Element-wise a * b * c on packed pairs.
+inline __device__ __nv_bfloat162 bf16hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    float fal, fah, fbl, fbh, fcl, fch;
+    fal = __low2float(a);
+    fah = __high2float(a);
+    fbl = __low2float(b);
+    fbh = __high2float(b);
+    fcl = __low2float(c);
+    fch = __high2float(c);
+    return __floats2bfloat162_rn(fal * fbl * fcl, fah * fbh * fch);
+#else
+    return a * b * c;
+#endif
+}
+// Element-wise a * b * c + d on packed pairs.
+inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+    float fal, fah, fbl, fbh, fcl, fch, fdl, fdh;
+    fal = __low2float(a);
+    fah = __high2float(a);
+    fbl = __low2float(b);
+    fbh = __high2float(b);
+    fcl = __low2float(c);
+    fch = __high2float(c);
+    fdl = __low2float(d);
+    fdh = __high2float(d);
+    return __floats2bfloat162_rn(fal * fbl * fcl + fdl, fah * fbh * fch + fdh);
+#else
+    return a * b * c + d;
+#endif
+}
+#endif // ENABLE_BF16
+}  // namespace fastertransformer
--- a/awq_ext/attention/cuda_bf16_wrapper.h
+++ b/awq_ext/attention/cuda_bf16_wrapper.h
+// Downloaded from FasterTransformer v5.2.1
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/utils/cuda_bf16_wrapper.h
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#ifdef ENABLE_BF16
+#include <cuda_bf16.h>
+#endif
--- a/awq_ext/attention/decoder_masked_multihead_attention.cu
+++ b/awq_ext/attention/decoder_masked_multihead_attention.cu
+// Adapted from FasterTransformer v5.2.1
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_128.cu
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "decoder_masked_multihead_attention.h"
+#include "decoder_masked_multihead_attention_utils.h"
+#include "cuda_bf16_wrapper.h"
+#include <assert.h>
+#include <float.h>
+#include <type_traits>
+#include "decoder_masked_multihead_attention_template.hpp"
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Expands to the statements that launch one instantiation of
+// mmha::masked_multihead_attention_kernel:
+//   1. compute the dynamic shared-memory size with mmha::smem_size_in_bytes,
+//   2. raise the kernel's cudaFuncAttributeMaxDynamicSharedMemorySize to that size
+//      (the return value of cudaFuncSetAttribute is not checked),
+//   3. launch on a (num_heads, batch_size) grid with THDS_PER_BLOCK threads per block.
+// A variable named `params` must be in scope at the expansion site. The expansion
+// declares the locals `smem_sz`, `kernel` and `grid` and ends without a semicolon,
+// so it must be used at most once per scope and be followed by `;`.
+#define MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, DO_CROSS_ATTENTION, stream)    \
+    size_t smem_sz = mmha::smem_size_in_bytes<T, DO_CROSS_ATTENTION>(params, THDS_PER_VALUE, THDS_PER_BLOCK);          \
+    auto kernel = mmha::masked_multihead_attention_kernel<T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE,                 \
+                                                          THDS_PER_BLOCK, DO_CROSS_ATTENTION>;                         \
+    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_sz);                                \
+    dim3 grid(params.num_heads, params.batch_size);                                                                    \
+    kernel<<<grid, THDS_PER_BLOCK, smem_sz, stream>>>(params)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// !!! Specialize the launcher for Cross attention
+// Picks the thread configuration from the sequence length and launches the kernel.
+//
+// The length considered is params.memory_max_len when KERNEL_PARAMS_TYPE is
+// Cross_multihead_attention_params<T>, and params.timestep otherwise:
+//   length <  32   -> 4 threads per key,  64 threads per block
+//   length <  2048 -> 2 threads per key, 128 threads per block
+//   otherwise      -> 1 thread  per key, 256 threads per block
+template<typename T, int Dh, int Dh_MAX, typename KERNEL_PARAMS_TYPE>
+void mmha_launch_kernel(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream)
+{
+    constexpr bool DO_CROSS_ATTENTION = std::is_same<KERNEL_PARAMS_TYPE, Cross_multihead_attention_params<T>>::value;
+    // Number of threads covering one value vector when each handles 16 bytes.
+    constexpr int THREADS_PER_VALUE = Dh_MAX * sizeof(T) / 16;
+    const int     tlength           = DO_CROSS_ATTENTION ? params.memory_max_len : params.timestep;
+
+    // Each launch sits in its own block because the macro declares local variables.
+    if (tlength >= 2048) {
+        MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 1, THREADS_PER_VALUE, 256, DO_CROSS_ATTENTION, stream);
+        return;
+    }
+    if (tlength >= 32) {
+        MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 2, THREADS_PER_VALUE, 128, DO_CROSS_ATTENTION, stream);
+        return;
+    }
+    MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 64, DO_CROSS_ATTENTION, stream);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#undef MMHA_LAUNCH_KERNEL
+// Dispatches on params.hidden_size_per_head to the mmha_launch_kernel
+// instantiation for that head size. Dh is the actual head size; Dh_MAX is the
+// padded size passed alongside it (32, 64, 128 or 256 for the cases below).
+//
+// Supported head sizes: 32, 48, 64, 80, 96, 112, 128, 160, 192, 224, 256.
+// Any other value launches nothing; it is reported on stderr and then trips
+// the assert (which is a no-op when NDEBUG is defined).
+template<typename T, typename KERNEL_PARAMS_TYPE>
+void multihead_attention_(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream)
+{
+    switch (params.hidden_size_per_head) {
+        case 32:
+            mmha_launch_kernel<T, 32, 32, KERNEL_PARAMS_TYPE>(params, stream);
+            break;
+        case 48:
+            mmha_launch_kernel<T, 48, 64, KERNEL_PARAMS_TYPE>(params, stream);
+            break;
+        case 64:
+            mmha_launch_kernel<T, 64, 64, KERNEL_PARAMS_TYPE>(params, stream);
+            break;
+        case 80:
+            mmha_launch_kernel<T, 80, 128, KERNEL_PARAMS_TYPE>(params, stream);
+            break;
+        case 96:
+            mmha_launch_kernel<T, 96, 128, KERNEL_PARAMS_TYPE>(params, stream);
+            break;
+        case 112:
+            mmha_launch_kernel<T, 112, 128, KERNEL_PARAMS_TYPE>(params, stream);
+            break;
+        case 128:
+            mmha_launch_kernel<T, 128, 128, KERNEL_PARAMS_TYPE>(params, stream);
+            break;
+        case 160:
+            mmha_launch_kernel<T, 160, 256, KERNEL_PARAMS_TYPE>(params, stream);
+            break;
+        case 192:
+            mmha_launch_kernel<T, 192, 256, KERNEL_PARAMS_TYPE>(params, stream);
+            break;
+        case 224:
+            mmha_launch_kernel<T, 224, 256, KERNEL_PARAMS_TYPE>(params, stream);
+            break;
+        case 256:
+            mmha_launch_kernel<T, 256, 256, KERNEL_PARAMS_TYPE>(params, stream);
+            break;
+        default:
+            // Without this message an NDEBUG build would skip the launch silently
+            // and the caller would read an output buffer that was never written.
+            fprintf(stderr,
+                    "multihead_attention_: unsupported hidden_size_per_head = %d\n",
+                    static_cast<int>(params.hidden_size_per_head));
+            assert(false);
+    }
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Masked attention entry point for float parameters; forwards to the
+// head-size dispatcher multihead_attention_ on the given stream.
+void masked_multihead_attention(const Masked_multihead_attention_params<float>& params, const cudaStream_t& stream)
+{
+    multihead_attention_<float, Masked_multihead_attention_params<float>>(params, stream);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Masked attention entry point for uint16_t parameters; forwards to the
+// head-size dispatcher multihead_attention_ on the given stream.
+// NOTE(review): uint16_t presumably carries fp16 bit patterns - confirm against the kernel template.
+void masked_multihead_attention(const Masked_multihead_attention_params<uint16_t>& params, const cudaStream_t& stream)
+{
+    multihead_attention_<uint16_t, Masked_multihead_attention_params<uint16_t>>(params, stream);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef ENABLE_BF16
+// Masked attention entry point for __nv_bfloat16 parameters (compiled only when
+// ENABLE_BF16 is defined); forwards to multihead_attention_ on the given stream.
+void masked_multihead_attention(const Masked_multihead_attention_params<__nv_bfloat16>& params,
+                                const cudaStream_t&                                     stream)
+{
+    multihead_attention_<__nv_bfloat16, Masked_multihead_attention_params<__nv_bfloat16>>(params, stream);
+}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Cross attention entry point for float parameters; forwards to the
+// head-size dispatcher multihead_attention_ on the given stream.
+void cross_multihead_attention(const Cross_multihead_attention_params<float>& params, const cudaStream_t& stream)
+{
+    multihead_attention_<float, Cross_multihead_attention_params<float>>(params, stream);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Cross attention entry point for uint16_t parameters; forwards to the
+// head-size dispatcher multihead_attention_ on the given stream.
+// NOTE(review): uint16_t presumably carries fp16 bit patterns - confirm against the kernel template.
+void cross_multihead_attention(const Cross_multihead_attention_params<uint16_t>& params, const cudaStream_t& stream)
+{
+    multihead_attention_<uint16_t, Cross_multihead_attention_params<uint16_t>>(params, stream);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef ENABLE_BF16
+// Cross attention entry point for __nv_bfloat16 parameters (compiled only when
+// ENABLE_BF16 is defined); forwards to multihead_attention_ on the given stream.
+void cross_multihead_attention(const Cross_multihead_attention_params<__nv_bfloat16>& params,
+                               const cudaStream_t&                                    stream)
+{
+    multihead_attention_<__nv_bfloat16, Cross_multihead_attention_params<__nv_bfloat16>>(params, stream);
+}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////
--- a/awq_ext/attention/decoder_masked_multihead_attention.h
+++ b/awq_ext/attention/decoder_masked_multihead_attention.h
+// Downloaded from FasterTransformer v5.2.1
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention.h
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "cuda_bf16_wrapper.h"
+#include <cuda_fp16.h>
+#include <cuda_runtime_api.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// CHECK_CUDA(call): evaluates `call` once and stores its cudaError_t result. If the result is
+// not cudaSuccess, prints the source file, line and cudaGetErrorString() text to stderr and
+// terminates the process with exit(1). The do { ... } while (0) wrapper makes the macro usable
+// as a single statement (e.g. inside an unbraced if/else).
+#define CHECK_CUDA(call)                                                                                               \
+    do {                                                                                                               \
+        cudaError_t status_ = call;                                                                                    \
+        if (status_ != cudaSuccess) {                                                                                  \
+            fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_));              \
+            exit(1);                                                                                                   \
+        }                                                                                                              \
+    } while (0)
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// The structure of parameters for the masked multihead attention kernel.
+//
+// We use the following terminology to describe the different dimensions.
+//
+// B:  Batch size (number of sequences),
+// L:  Sequence length,
+// D:  Hidden dimension,
+// H:  Number of heads,
+// Dh: Hidden dimension per head - Dh = D / H.
+// Fields shared by the masked and cross attention parameter structs. T is the element type of
+// the Q/K/V, cache and output buffers. Every pointer defaults to nullptr and every scalar to
+// zero/false, so a default-constructed instance is fully initialized.
+template<typename T>
+struct Multihead_attention_params_base {
+    // The output buffer. Dimensions B x D.
+    T* out = nullptr;
+    // The input Qs and the associated bias. Dimensions B x D and D, resp.
+    const T *q = nullptr, *q_bias = nullptr;
+    // The input Ks and the associated bias. Dimensions B x D and D, resp.
+    const T *k = nullptr, *k_bias = nullptr;
+    // The input Vs and the associated bias. Dimensions B x D and D, resp.
+    const T *v = nullptr, *v_bias = nullptr;
+    // The cache for the Ks. The size must be at least B x L x D.
+    T* k_cache = nullptr;
+    // The cache for the Vs. The size must be at least B x L x D.
+    T* v_cache = nullptr;
+    // The indirections to use for cache when beam sampling.
+    const int* cache_indir = nullptr;
+    // Stride to handle the case when KQV is a single buffer
+    int stride = 0;
+    // The batch size.
+    int batch_size = 0;
+    // The beam width
+    int beam_width = 0;
+    // The sequence length.
+    int memory_max_len = 0;
+    // The number of heads (H).
+    int num_heads = 0;
+    // The number of heads for KV cache.
+    int num_kv_heads = 0;
+    // The hidden dimension per head (Dh).
+    int hidden_size_per_head = 0;
+    // The per-head latent space reserved for rotary embeddings.
+    int  rotary_embedding_dim = 0;
+    // Rotary layout selector; the caller in ft_attention.cpp describes the "neox" style as
+    // "not interleaved".
+    bool neox_rotary_style    = false;
+    // Base value for the rotary embedding (ft_attention.h defaults it to 10000.0f).
+    // NOTE(review): exact use is defined by the kernel, which is not visible here.
+    float rotary_base = 0.0f;
+    // The maximum length of input sentences.
+    int max_input_length = 0;
+    // The current timestep. TODO(bhsueh) Check whether this param is only needed in cross attention.
+    int timestep = 0;
+    // NOTE(review): a comment about a per-sentence timestep used to sit here with no field
+    // attached; it presumably refers to length_per_sample, which is declared in the derived
+    // Multihead_attention_params structs -- confirm.
+    // The 1.f / sqrt(Dh). Computed on the host.
+    float inv_sqrt_dh = 0.0f;
+    // Used when we have some input context like gpt
+    const int* total_padding_tokens = nullptr;
+    // Masking and prefix-prompt inputs. NOTE(review): shapes and semantics are defined by the
+    // kernel and are not visible in this header.
+    const bool* masked_tokens            = nullptr;
+    const int*  prefix_prompt_lengths    = nullptr;
+    int         max_prefix_prompt_length = 0;
+    // Optional relative attention bias and its stride. NOTE(review): layout not visible here.
+    const T* relative_attention_bias        = nullptr;
+    int      relative_attention_bias_stride = 0;
+    // The slope per head of linear position bias to attention score (H).
+    const float* linear_bias_slopes = nullptr;
+    // IA3 inputs. NOTE(review): semantics are defined by the kernel, not visible here.
+    const T*   ia3_key_weights   = nullptr;
+    const T*   ia3_value_weights = nullptr;
+    const int* ia3_tasks         = nullptr;
+    // Scales and mode flag for the int8 path. NOTE(review): meaning of the individual
+    // int8_mode values is not visible in this header.
+    const float* qkv_scale_out       = nullptr;
+    const float* attention_out_scale = nullptr;
+    int          int8_mode           = 0;
+};
+// Primary template of the attention parameter struct: the shared base fields plus the
+// members below. A separate partial specialization exists for CROSS_ATTENTION == true, so
+// this definition is the one used for CROSS_ATTENTION == false.
+template<typename T, bool CROSS_ATTENTION>
+struct Multihead_attention_params: public Multihead_attention_params_base<T> {
+    // output cross attentions
+    float* cross_attention_out        = nullptr;
+    int    max_decoder_seq_len        = 0;
+    bool   is_return_cross_attentions = false;
+    // allows exiting attention early
+    bool* finished = nullptr;
+    // required in case of cross attention
+    // kept in this (non-cross) variant as well until `if constexpr` (C++17) can be used
+    int* memory_length_per_sample = nullptr;
+    // required in case of masked attention with different length
+    const int* length_per_sample = nullptr;
+};
+// Partial specialization for CROSS_ATTENTION == true. It declares the same members, in the
+// same order, as the primary template.
+template<typename T>
+struct Multihead_attention_params<T, true>: public Multihead_attention_params_base<T> {
+    // output cross attentions
+    float* cross_attention_out        = nullptr;
+    int    max_decoder_seq_len        = 0;
+    bool   is_return_cross_attentions = false;
+    // allows exiting attention early
+    bool* finished = nullptr;
+    // required in case of cross attention
+    int* memory_length_per_sample = nullptr;
+    // required in case of masked attention with different length
+    const int* length_per_sample = nullptr;
+};
+// Convenience aliases that fix the CROSS_ATTENTION flag of Multihead_attention_params:
+// false selects the masked (self-attention) variant, true selects the cross-attention variant.
+template<class T>
+using Masked_multihead_attention_params = Multihead_attention_params<T, false>;
+template<class T>
+using Cross_multihead_attention_params = Multihead_attention_params<T, true>;
+// Small bundle of the cross-attention output settings; all members default to zero/null/false.
+// NOTE(review): no user of this struct is visible in this part of the file.
+template<typename T>
+struct outputCrossAttentionParam {
+    // max decoder output length
+    int  max_decoder_seq_len        = 0;
+    // destination buffer for the returned cross attentions (element type T)
+    T*   cross_attention_out        = nullptr;
+    // whether cross attentions should be written out
+    bool is_return_cross_attentions = false;
+};
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Host entry points, one overload per supported element type (float, uint16_t and, when
+// ENABLE_BF16 is defined, __nv_bfloat16). Each takes a filled parameter struct and the CUDA
+// stream to use.
+void masked_multihead_attention(const Masked_multihead_attention_params<float>& params, const cudaStream_t& stream);
+void masked_multihead_attention(const Masked_multihead_attention_params<uint16_t>& params, const cudaStream_t& stream);
+#ifdef ENABLE_BF16
+void masked_multihead_attention(const Masked_multihead_attention_params<__nv_bfloat16>& params,
+                                const cudaStream_t&                                     stream);
+#endif
+// Cross-attention counterparts of the overloads above.
+void cross_multihead_attention(const Cross_multihead_attention_params<float>& params, const cudaStream_t& stream);
+void cross_multihead_attention(const Cross_multihead_attention_params<uint16_t>& params, const cudaStream_t& stream);
+#ifdef ENABLE_BF16
+void cross_multihead_attention(const Cross_multihead_attention_params<__nv_bfloat16>& params,
+                               const cudaStream_t&                                    stream);
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////
--- a/awq_ext/attention/decoder_masked_multihead_attention_template.hpp
+++ b/awq_ext/attention/decoder_masked_multihead_attention_template.hpp
--- a/awq_ext/attention/decoder_masked_multihead_attention_utils.h
+++ b/awq_ext/attention/decoder_masked_multihead_attention_utils.h
--- a/awq_ext/attention/ft_attention.cpp
+++ b/awq_ext/attention/ft_attention.cpp
+// Adapted from NVIDIA/FasterTransformer and FlashAttention
+#include <torch/extension.h>
+#include "ATen/cuda/CUDAContext.h"
+#include <c10/cuda/CUDAGuard.h>
+#include "ft_attention.h"
+#include "decoder_masked_multihead_attention.h"
+// Argument-validation helpers. Each one raises through TORCH_CHECK, with the stringified
+// argument name in the message, when the condition does not hold.
+// CHECK_DEVICE: the tensor's device type must be CUDA (the device index is not compared).
+#define CHECK_DEVICE(x) TORCH_CHECK(x.device().type() == torch::kCUDA, #x " must be on CUDA")
+// CHECK_SHAPE: the tensor's sizes must equal exactly the listed dimensions.
+#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
+// CHECK_CONTIGUOUS: the tensor must report is_contiguous().
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+// Runtime dtype dispatch: defines `scalar_t` as at::Half, at::BFloat16 or float according to
+// TYPE and invokes the trailing callable (passed as __VA_ARGS__) in that scope; any other TYPE
+// raises via AT_ERROR.
+// Notes:
+//  - NAME is stringified with `#`, so passing an already quoted string literal makes the
+//    quotes appear inside the error message.
+//  - The expansion is a bare if / else-if chain (not wrapped in do { } while (0)), so use it as
+//    a standalone statement.
+#define DISPATCH_FLOAT_AND_HALF_AND_BF16(TYPE, NAME, ...)                  \
+  if (TYPE == at::ScalarType::Half) {                                      \
+    using scalar_t = at::Half;                                             \
+    __VA_ARGS__();                                                         \
+  } else if (TYPE == at::ScalarType::BFloat16) {                           \
+    using scalar_t = at::BFloat16;                                         \
+    __VA_ARGS__();                                                         \
+  } else if (TYPE == at::ScalarType::Float)  {                             \
+    using scalar_t = float;                                                \
+    __VA_ARGS__();                                                         \
+  } else {                                                                 \
+    AT_ERROR(#NAME, " not implemented for type '", toString(TYPE), "'"); \
+  }
+// Generic declarations of the attention entry points (no definitions in this file). They let
+// the dtype dispatch below compile for every element type; the non-template overloads declared
+// in decoder_masked_multihead_attention.h are preferred by overload resolution when one exists
+// for the element type.
+template<typename T>
+void masked_multihead_attention(const Masked_multihead_attention_params<T>& params,
+                                const cudaStream_t& stream);
+// Takes the cross-attention parameter struct, matching the non-template overloads in the header
+// (the previous declaration mistakenly used the masked parameter type).
+template<typename T>
+void cross_multihead_attention(const Cross_multihead_attention_params<T>& params,
+                               const cudaStream_t& stream);
+// Maps a Torch scalar type to the element type used for the attention parameter structs.
+// By default the type is passed through unchanged (e.g. float stays float).
+template<typename T>
+struct SATypeConverter {
+    using Type = T;
+};
+// at::Half buffers are handed over as raw 16-bit words (uint16_t).
+template<>
+struct SATypeConverter<at::Half> {
+    using Type = uint16_t;
+};
+// at::BFloat16 buffers are handed over as CUDA's __nv_bfloat16.
+template<>
+struct SATypeConverter<at::BFloat16> {
+    using Type = __nv_bfloat16;
+};
+// Fills `params` for a single masked-attention call.
+//
+// params             Struct to (re)initialize; every field not set below keeps its default
+//                    (nullptr / 0 / false).
+// batch_size, nheads, nheads_kv, memory_max_seqlen, headdim
+//                    Problem dimensions; stored into the struct's int fields.
+// timestep           Stored as params.timestep.
+// rotary_embedding_dim, rotary_base, neox_rotary_style
+//                    Rotary-embedding settings, copied verbatim.
+// qkv_batch_stride   Stored as params.stride.
+// q_ptr, k_ptr, v_ptr, k_cache_ptr, v_cache_ptr, out_ptr
+//                    Device buffers for the inputs, the KV caches and the output.
+// length_per_sample  Optional per-sample lengths (may be nullptr).
+// alibi_slopes_ptr   Optional per-head linear bias slopes (may be nullptr).
+template <typename T>
+void set_params(Masked_multihead_attention_params<T> &params,
+                const size_t batch_size,
+                const size_t nheads,
+                const size_t nheads_kv,
+                const size_t memory_max_seqlen,
+                const size_t headdim,
+                const int timestep,
+                const int rotary_embedding_dim,
+                const float rotary_base,
+                const bool neox_rotary_style,
+                const int qkv_batch_stride,
+                T *q_ptr,
+                T *k_ptr,
+                T *v_ptr,
+                T *k_cache_ptr,
+                T *v_cache_ptr,
+                int *length_per_sample,
+                float *alibi_slopes_ptr,
+                T *out_ptr) {
+    // Reset the parameters. Value-initialization is used instead of memset: the struct has
+    // default member initializers, so memset on it triggers -Wclass-memaccess, while this form
+    // yields the same all-null/zero/false state through the type system.
+    params = Masked_multihead_attention_params<T>{};
+
+    // Input, cache and output buffers. Biases, cache indirection, padding/prefix/mask inputs,
+    // relative attention bias and the cross-attention outputs stay at their null/zero defaults.
+    params.q = q_ptr;
+    params.k = k_ptr;
+    params.v = v_ptr;
+    params.k_cache = k_cache_ptr;
+    params.v_cache = v_cache_ptr;
+    params.out = out_ptr;
+    params.linear_bias_slopes = alibi_slopes_ptr;
+    params.length_per_sample = length_per_sample;
+
+    // Problem dimensions (the struct stores them as int).
+    params.stride = qkv_batch_stride;
+    params.batch_size = static_cast<int>(batch_size);
+    params.beam_width = 1;
+    params.memory_max_len = static_cast<int>(memory_max_seqlen);
+    params.num_heads = static_cast<int>(nheads);
+    params.num_kv_heads = static_cast<int>(nheads_kv);
+    params.hidden_size_per_head = static_cast<int>(headdim);
+
+    // Rotary embedding configuration.
+    params.rotary_embedding_dim = rotary_embedding_dim;
+    params.rotary_base = rotary_base;
+    params.neox_rotary_style = neox_rotary_style;
+
+    params.timestep = timestep;
+    // Softmax scale 1 / sqrt(Dh), computed on the host.
+    params.inv_sqrt_dh = 1.f / sqrt(float(headdim));
+}
+// Validates the inputs, fills a Masked_multihead_attention_params struct and calls
+// masked_multihead_attention on the current CUDA stream of q's device.
+//
+// q                   [B, H, Dh]
+// k, v                [B, H_kv, Dh]
+// k_cache             [B, H_kv, Dh/x, L, x], x = 4 for fp32 and 8 otherwise; contiguous
+// v_cache             [B, H_kv, L, Dh]; contiguous
+// length_per_sample_  optional int32 tensor of shape [B]
+// alibi_slopes_       optional float32 tensor of shape [H]
+// timestep, rotary_embedding_dim, rotary_base, neox_rotary_style
+//                     forwarded unchanged to the parameter struct
+//
+// Returns a new tensor with the same shape and dtype as q. Raises (via TORCH_CHECK) when a
+// device, dtype, shape, stride or contiguity requirement is violated.
+torch::Tensor single_query_attention(const torch::Tensor q,
+                                     const torch::Tensor k,
+                                     const torch::Tensor v,
+                                     torch::Tensor k_cache,
+                                     torch::Tensor v_cache,
+                                     c10::optional<const torch::Tensor> length_per_sample_,
+                                     c10::optional<const torch::Tensor> alibi_slopes_,
+                                     const int timestep,
+                                     const int rotary_embedding_dim,
+                                     const float rotary_base,
+                                     // neox_rotary_style = not interleaved
+                                     const bool neox_rotary_style) {
+    CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); CHECK_DEVICE(k_cache); CHECK_DEVICE(v_cache);
+    // Every buffer below is reinterpreted as the element type selected from q's dtype and is
+    // accessed from q's device, so a tensor with another dtype or on another device would be
+    // read or written incorrectly. Reject such inputs up front.
+    TORCH_CHECK(k.scalar_type() == q.scalar_type() && v.scalar_type() == q.scalar_type(),
+                "q, k and v must have the same dtype");
+    TORCH_CHECK(k_cache.scalar_type() == q.scalar_type() && v_cache.scalar_type() == q.scalar_type(),
+                "k_cache and v_cache must have the same dtype as q");
+    TORCH_CHECK(k.device() == q.device() && v.device() == q.device()
+                    && k_cache.device() == q.device() && v_cache.device() == q.device(),
+                "q, k, v, k_cache and v_cache must be on the same device");
+    int batch_size = v_cache.size(0);
+    int nheads = q.size(1);
+    int nheads_kv = v_cache.size(1);
+    int memory_max_seqlen = v_cache.size(2);
+    int headdim = v_cache.size(3);
+    CHECK_SHAPE(q, batch_size, nheads, headdim);
+    CHECK_SHAPE(k, batch_size, nheads_kv, headdim);
+    CHECK_SHAPE(v, batch_size, nheads_kv, headdim);
+    CHECK_SHAPE(v_cache, batch_size, nheads_kv, memory_max_seqlen, headdim);
+    // k_cache shape: [B, H, Dh/x, L, x] where x=8 for fp16 and x=4 for fp32
+    int packsize = k_cache.dtype() == torch::kFloat32 ? 4 : 8;
+    // Without this, the integer division below would silently accept a k_cache that is too
+    // small to hold Dh elements per position.
+    TORCH_CHECK(headdim % packsize == 0, "head dimension must be divisible by ", packsize);
+    CHECK_SHAPE(k_cache, batch_size, nheads_kv, headdim / packsize, memory_max_seqlen, packsize);
+    // q/k/v may be slices of a fused buffer: only the two innermost dims must be dense, the
+    // batch stride is passed to the kernel separately.
+    TORCH_CHECK(q.stride(2) == 1 && q.stride(1) == headdim);
+    TORCH_CHECK(k.stride(2) == 1 && k.stride(1) == headdim);
+    TORCH_CHECK(v.stride(2) == 1 && v.stride(1) == headdim);
+    // TORCH_CHECK(q.stride(0) == k.stride(0) && q.stride(0) == v.stride(0));
+    CHECK_CONTIGUOUS(v_cache); CHECK_CONTIGUOUS(k_cache);
+    if (length_per_sample_.has_value()) {
+        auto length_per_sample = length_per_sample_.value();
+        CHECK_DEVICE(length_per_sample);
+        CHECK_SHAPE(length_per_sample, batch_size);
+        CHECK_CONTIGUOUS(length_per_sample);
+        TORCH_CHECK(length_per_sample.dtype() == torch::kInt32);
+    }
+    if (alibi_slopes_.has_value()) {
+        auto alibi_slopes = alibi_slopes_.value();
+        CHECK_DEVICE(alibi_slopes);
+        CHECK_SHAPE(alibi_slopes, nheads);
+        CHECK_CONTIGUOUS(alibi_slopes);
+        TORCH_CHECK(alibi_slopes.dtype() == torch::kFloat32);
+    }
+    // Otherwise the kernel will be launched from cuda:0 device
+    // Cast to char to avoid compiler warning about narrowing
+    at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+    torch::Tensor out = torch::empty_like(q);
+    DISPATCH_FLOAT_AND_HALF_AND_BF16(q.scalar_type(), "single_query_attention", [&] {
+        using DataType = typename SATypeConverter<scalar_t>::Type;
+        Masked_multihead_attention_params<DataType> params;
+        set_params(params, batch_size, nheads, nheads_kv, memory_max_seqlen, headdim,
+                   timestep, rotary_embedding_dim, rotary_base, neox_rotary_style, q.stride(0),
+                   reinterpret_cast<DataType*>(q.data_ptr()),
+                   reinterpret_cast<DataType*>(k.data_ptr()),
+                   reinterpret_cast<DataType*>(v.data_ptr()),
+                   reinterpret_cast<DataType*>(k_cache.data_ptr()),
+                   reinterpret_cast<DataType*>(v_cache.data_ptr()),
+                   length_per_sample_.has_value()
+                       ? length_per_sample_.value().data_ptr<int>() : nullptr,
+                   alibi_slopes_.has_value()
+                       ? alibi_slopes_.value().data_ptr<float>(): nullptr,
+                   reinterpret_cast<DataType*>(out.data_ptr()));
+        auto stream = at::cuda::getCurrentCUDAStream();
+        masked_multihead_attention(params, stream);
+    });
+    return out;
+}
\ No newline at end of file
--- a/awq_ext/attention/ft_attention.h
+++ b/awq_ext/attention/ft_attention.h
+#pragma once
+#include <torch/extension.h>
+// Single-query attention entry point implemented in ft_attention.cpp.
+// length_per_sample_ and alibi_slopes_ are optional. The rotary parameters carry defaults:
+// rotary_embedding_dim = 0, rotary_base = 10000.0f and neox_rotary_style = true.
+torch::Tensor single_query_attention(const torch::Tensor q,
+                                     const torch::Tensor k,
+                                     const torch::Tensor v,
+                                     torch::Tensor k_cache,
+                                     torch::Tensor v_cache,
+                                     c10::optional<const torch::Tensor> length_per_sample_,
+                                     c10::optional<const torch::Tensor> alibi_slopes_,
+                                     const int timestep,
+                                     const int rotary_embedding_dim = 0,
+                                     const float rotary_base = 10000.0f,
+                                     const bool neox_rotary_style=true);
\ No newline at end of file
--- a/awq_ext/exllama/cu_compat.cuh
+++ b/awq_ext/exllama/cu_compat.cuh
+// Adapted from turboderp exllama: https://github.com/turboderp/exllama
+#ifndef _cuda_compat_cuh
+#define _cuda_compat_cuh
+// atomicAdd for half types, to support CC < 7.x
+// Emulates a 16-bit atomic add with a 32-bit atomicCAS on the 4-byte word that contains
+// `address`. Bit 1 of the address tells which 16-bit half of that word is the target.
+__device__ __forceinline__ void atomicAdd_half(half* address, half val)
+{
+    // Step back by 2 bytes when `address` is the upper half, giving the enclosing 32-bit word.
+    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+    do
+    {
+        assumed = old;
+        __half_raw hsum;
+        // Extract the current 16-bit value of the target half from the word snapshot.
+        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+        half tmpres = __hadd(hsum, val);
+        hsum = __half_raw(tmpres);
+        // Splice the sum back into the word, leaving the neighbouring 16 bits untouched.
+        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
+        // Store only if the word still equals the snapshot; atomicCAS returns the value it saw.
+        old = atomicCAS(address_as_ui, assumed, old);
+    }
+    // A different observed value means the word changed in between: retry with the new one.
+    while (assumed != old);
+}
+// atomicAdd for half2 types
+// Treats the half2 at `address` as one 32-bit word and adds `val` to it with __hadd2 inside a
+// compare-and-swap retry loop.
+__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
+{
+    unsigned int* address_as_ui = (unsigned int*)address;
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+    do
+    {
+        assumed = old;
+        // Reinterpret the 32-bit snapshot as a half2 and add.
+        half2 old_val = *((half2*)&old);
+        half2 new_val = __hadd2(old_val, val);
+        // Store only if the word still equals the snapshot; atomicCAS returns the value it saw.
+        old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
+    }
+    // Retry when the word changed between the read and the CAS.
+    while (assumed != old);
+}
+//
+#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
+#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
+//__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
+#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
+//__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
+#endif
+#endif
+#endif
+#endif
--- a/awq_ext/exllama/cuda_buffers.cu
+++ b/awq_ext/exllama/cuda_buffers.cu
+// Adapted from turboderp exllama: https://github.com/turboderp/exllama
+#define _cuda_buffers_cu
+#include "cuda_buffers.cuh"
+// Table of per-device buffer objects, indexed by device number. Entries start out NULL, are
+// filled by prepare_buffers_cuda() and released by cleanup_buffers_cuda().
+CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL};
+// __constant__ half2 q4_table[16][256];
+// half2 q4_table_host[16][256];
+// bool q4_table_init = false;
+// Records the caller-provided scratch pointers (they are not owned: the destructor does not
+// free them) and creates three streams and three events on `_device`.
+// Note: cudaSetDevice() switches the current device and it is not restored afterwards; the
+// return codes of the CUDA calls are not checked.
+CudaBuffers::CudaBuffers
+(
+    int _device,
+    int _temp_state_size,
+    half* _temp_state,
+    half* _temp_dq
+) :
+    // Listed in member declaration order (device, temp_state, temp_state_size, temp_dq), which
+    // is the order members are actually initialized in; avoids -Wreorder.
+    device(_device),
+    temp_state(_temp_state),
+    temp_state_size(_temp_state_size),
+    temp_dq(_temp_dq)
+{
+    cudaSetDevice(_device);
+    cudaStreamCreate(&alt_stream_1);
+    cudaStreamCreate(&alt_stream_2);
+    cudaStreamCreate(&alt_stream_3);
+    cudaEventCreate(&alt_stream_1_done);
+    cudaEventCreate(&alt_stream_2_done);
+    cudaEventCreate(&alt_stream_3_done);
+}
+// Destroys the three streams and three events created by the constructor. temp_state and
+// temp_dq are not released here.
+CudaBuffers::~CudaBuffers()
+{
+    cudaStreamDestroy(alt_stream_1);
+    cudaStreamDestroy(alt_stream_2);
+    cudaStreamDestroy(alt_stream_3);
+    cudaEventDestroy(alt_stream_1_done);
+    cudaEventDestroy(alt_stream_2_done);
+    cudaEventDestroy(alt_stream_3_done);
+}
+// Returns the buffers registered for `device_index`, or NULL when none have been prepared for
+// that device or the index lies outside [0, CUDA_MAX_DEVICES).
+CudaBuffers* get_buffers(const int device_index)
+{
+    // Guard the table lookup: an out-of-range index would read past the end of g_buffers.
+    if (device_index < 0 || device_index >= CUDA_MAX_DEVICES) return NULL;
+    return g_buffers[device_index];
+}
+// Creates the CudaBuffers object for `_device` and registers it in g_buffers.
+//
+// _device           Device number; must lie in [0, CUDA_MAX_DEVICES).
+// _temp_state_size  Stored in the new object as temp_state_size.
+// _temp_state       Caller-owned scratch buffer (not freed by CudaBuffers).
+// _temp_dq          Caller-owned scratch buffer (not freed by CudaBuffers).
+//
+// An out-of-range device is reported on stderr and ignored. Preparing a device a second time
+// replaces the previous object and releases it.
+void prepare_buffers_cuda
+(
+    int _device,
+    int _temp_state_size,
+    half* _temp_state,
+    half* _temp_dq
+)
+{
+    // Writing g_buffers[_device] with an out-of-range index would corrupt adjacent memory.
+    if (_device < 0 || _device >= CUDA_MAX_DEVICES)
+    {
+        fprintf(stderr, "prepare_buffers_cuda: device index %d out of range [0, %d)\n", _device, CUDA_MAX_DEVICES);
+        return;
+    }
+
+    CudaBuffers* buffers = new CudaBuffers
+    (
+        _device,
+        _temp_state_size,
+        _temp_state,
+        _temp_dq
+    );
+
+    // Release the object registered earlier for this device (if any) so its streams and events
+    // are not leaked; deleting NULL is a no-op.
+    delete g_buffers[_device];
+    g_buffers[_device] = buffers;
+}
+// Releases every registered CudaBuffers object and resets its table entry to NULL.
+void cleanup_buffers_cuda()
+{
+    for (CudaBuffers*& slot : g_buffers)
+    {
+        if (slot)
+        {
+            delete slot;
+            slot = NULL;
+        }
+    }
+}
--- a/awq_ext/exllama/cuda_buffers.cuh
+++ b/awq_ext/exllama/cuda_buffers.cuh
+// Adapted from turboderp exllama: https://github.com/turboderp/exllama
+#ifndef _cuda_buffers_cuh
+#define _cuda_buffers_cuh
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cstdint>
+#include <cstdio>
+// Capacity of the per-device buffer table (g_buffers in cuda_buffers.cu).
+const int CUDA_MAX_DEVICES = 16;
+// #ifndef _cuda_buffers_cu
+// extern __constant__ half2 q4_table[16][256];
+// #endif
+// Per-device state: two caller-provided scratch buffers plus three auxiliary CUDA streams, each
+// with a matching event. The streams and events are created in the constructor and destroyed
+// in the destructor; the scratch buffers are not owned by this class.
+class CudaBuffers
+{
+public:
+    int device;
+    half* temp_state;           // [max_hidden_rows * intermediate_size]
+    int temp_state_size;
+    half* temp_dq;              // size of largest quant tensor * 8
+    cudaStream_t alt_stream_1;
+    cudaStream_t alt_stream_2;
+    cudaStream_t alt_stream_3;
+    cudaEvent_t alt_stream_1_done;
+    cudaEvent_t alt_stream_2_done;
+    cudaEvent_t alt_stream_3_done;
+    CudaBuffers
+    (
+        int _device,
+        int _temp_state_size,
+        half* _temp_state,
+        half* _temp_dq
+    );
+    // Non-copyable: a copy would share the stream/event handles, and both destructors would
+    // then destroy the same handles.
+    CudaBuffers(const CudaBuffers&) = delete;
+    CudaBuffers& operator=(const CudaBuffers&) = delete;
+    ~CudaBuffers();
+};
+// Returns the buffers registered for the given device (NULL when none were prepared).
+CudaBuffers* get_buffers(const int device_index);
+// Creates a CudaBuffers object from the arguments and registers it for `_device`.
+void prepare_buffers_cuda
+(
+    int _device,
+    int _temp_state_size,
+    half* _temp_state,
+    half* _temp_dq
+);
+// Deletes every registered CudaBuffers object and clears the table.
+void cleanup_buffers_cuda();
+#endif
--- a/awq_ext/exllama/cuda_func/column_remap.cu
+++ b/awq_ext/exllama/cuda_func/column_remap.cu
--- a/awq_ext/exllama/cuda_func/column_remap.cuh
+++ b/awq_ext/exllama/cuda_func/column_remap.cuh
+// Adapted from turboderp exllama: https://github.com/turboderp/exllama
+#ifndef _column_remap_cuh
+#define _column_remap_cuh
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cstdint>
+// Declaration only; the definition lives in column_remap.cu.
+// NOTE(review): judging by the names, this presumably writes into x_new a copy of the
+// x_height x x_width matrix x with its columns permuted according to x_map -- confirm against
+// the implementation, including the direction of the mapping.
+void column_remap_cuda
+(
+    const half* x,
+    half* x_new,
+    const int x_height,
+    const int x_width,
+    const uint32_t* x_map
+);
+#endif
\ No newline at end of file
--- a/awq_ext/exllama/cuda_func/q4_matmul.cu
+++ b/awq_ext/exllama/cuda_func/q4_matmul.cu