Commit 6a583c2f authored by chenych

update dtk to 24.04.1 and modify README

parent 7d576a9a
[html]
directory = coverage
[run]
data_file = .coverage_$LOCAL_RANK
---
name: Bug report
about: Create a report to help us improve
title: "[BUG]"
labels: bug
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**Hardware details**
Information about CPU and GPU, such as RAM, number, etc.
**Software version**
Version of relevant software such as operating system, CUDA toolkit, Python, auto-gptq, PyTorch, transformers, accelerate, etc.
**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Additional context**
Add any other context about the problem here.
---
name: Custom issue template
about: Describe this issue template's purpose here.
title: ''
labels: ''
assignees: ''
---
---
name: Feature request
about: Suggest an idea for this project
title: "[FEATURE]"
labels: enhancement
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.
name: Build AutoGPTQ Wheels with CUDA for Linux
on: workflow_dispatch
jobs:
build_wheels:
if: ${{ github.repository_owner == 'AutoGPTQ' }}
name: Build wheels for ${{ matrix.os }} and Python ${{ matrix.pyver }} and CUDA ${{ matrix.cuda }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-20.04]
pyver: ["3.8", "3.9", "3.10", "3.11"]
cuda: ["11.8"] # wheels for CUDA 12.1 are built in build_wheels_pypi.yml
defaults:
run:
shell: bash
env:
CUDA_VERSION: ${{ matrix.cuda }}
steps:
- uses: actions/checkout@v3
- name: Free disk space
run: |
# Go from 19G to 54G free disk space in 3min
df -h
sudo apt-get update
sudo apt-get purge -y '^apache.*'
sudo apt-get purge -y '^imagemagick.*'
sudo apt-get purge -y '^dotnet.*'
sudo apt-get purge -y '^aspnetcore.*'
sudo apt-get purge -y 'php.*'
sudo apt-get purge -y '^temurin.*'
sudo apt-get purge -y '^mysql.*'
sudo apt-get purge -y '^java.*'
sudo apt-get purge -y '^openjdk.*'
sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel
df -h
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get clean
df -h
echo "https://github.com/actions/virtual-environments/issues/709"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -h
echo "remove big /usr/local"
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
df -h
echo "remove /usr/share leftovers"
sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1
sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1
sudo rm -rf /usr/share/swift > /dev/null 2>&1
df -h
echo "remove other leftovers"
sudo rm -rf /var/lib/mysql > /dev/null 2>&1
sudo rm -rf /home/runner/.dotnet > /dev/null 2>&1
sudo rm -rf /home/runneradmin/.dotnet > /dev/null 2>&1
sudo rm -rf /etc/skel/.dotnet > /dev/null 2>&1
sudo rm -rf /usr/local/.ghcup > /dev/null 2>&1
sudo rm -rf /usr/local/aws-cli > /dev/null 2>&1
sudo rm -rf /usr/local/lib/node_modules > /dev/null 2>&1
sudo rm -rf /usr/lib/heroku > /dev/null 2>&1
sudo rm -rf /usr/local/share/chromium > /dev/null 2>&1
df -h
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.pyver }}
- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.pyver }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Install Dependencies
run: |
conda install cuda-toolkit -c "nvidia/label/cuda-${CUDA_VERSION}.0"
# Refer to https://pytorch.org/get-started/locally/
python -m pip install torch --index-url https://download.pytorch.org/whl/cu118
python -m pip install --upgrade build setuptools wheel ninja numpy gekko pandas
- name: Check install
run: |
python -c "import torch; print('torch version:', torch.__version__)"
- name: Build Wheel
run: |
# For some reason $CONDA_PREFIX is empty.
export CUDA_HOME=/usr/share/miniconda
export CUDA_PATH=/usr/share/miniconda
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CONDA_PREFIX}/lib"
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX"
echo "CUDA_PATH:"
echo $CUDA_PATH
echo "PYPI_RELEASE:"
echo $PYPI_RELEASE
python setup.py sdist bdist_wheel
- uses: actions/upload-artifact@v3
with:
name: 'linux-cuda-wheels'
path: ./dist/*.whl
name: Build AutoGPTQ Wheels with CUDA for Windows
on: workflow_dispatch
jobs:
build_wheels:
if: ${{ github.repository_owner == 'AutoGPTQ' }}
name: Build wheels for ${{ matrix.os }} and Python ${{ matrix.pyver }} and CUDA ${{ matrix.cuda }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [windows-latest]
pyver: ["3.8", "3.9", "3.10", "3.11"]
cuda: ["11.8"] # wheels for CUDA 12.1 are built in build_wheels_pypi.yml
defaults:
run:
shell: pwsh
env:
CUDA_VERSION: ${{ matrix.cuda }}
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.pyver }}
- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.pyver }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Install Dependencies
run: |
conda install cuda-toolkit -c "nvidia/label/cuda-${env:CUDA_VERSION}.0"
# Refer to https://pytorch.org/get-started/locally/
python -m pip install torch --index-url https://download.pytorch.org/whl/cu118
python -m pip install --upgrade build setuptools wheel ninja numpy gekko pandas
- name: Check install
run: |
python -c "import torch; print('torch version:', torch.__version__)"
- name: Build Wheel
run: |
$env:CUDA_PATH = $env:CONDA_PREFIX
$env:CUDA_HOME = $env:CONDA_PREFIX
$env:TORCH_CUDA_ARCH_LIST = '6.0 6.1 7.0 7.5 8.0 8.6+PTX'
if ([decimal]$env:CUDA_VERSION -ge 11.8) { $env:TORCH_CUDA_ARCH_LIST = '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
python setup.py sdist bdist_wheel
- uses: actions/upload-artifact@v3
with:
name: 'windows-cuda-wheels'
path: ./dist/*.whl
name: Build AutoGPTQ Wheels for PyPI with CUDA for Linux
on: workflow_dispatch
jobs:
build_wheels:
if: ${{ github.repository_owner == 'AutoGPTQ' }}
name: Build wheels for ${{ matrix.os }} and Python ${{ matrix.pyver }} and CUDA 12.1
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-20.04]
pyver: ["3.8", "3.9", "3.10", "3.11"]
defaults:
run:
shell: bash
env:
CUDA_VERSION: "12.1"
steps:
- uses: actions/checkout@v3
- name: Free disk space
run: |
# Go from 19G to 54G free disk space in 3min
df -h
sudo apt-get update
sudo apt-get purge -y '^apache.*'
sudo apt-get purge -y '^imagemagick.*'
sudo apt-get purge -y '^dotnet.*'
sudo apt-get purge -y '^aspnetcore.*'
sudo apt-get purge -y 'php.*'
sudo apt-get purge -y '^temurin.*'
sudo apt-get purge -y '^mysql.*'
sudo apt-get purge -y '^java.*'
sudo apt-get purge -y '^openjdk.*'
sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel
df -h
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get clean
df -h
echo "https://github.com/actions/virtual-environments/issues/709"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -h
echo "remove big /usr/local"
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
df -h
echo "remove /usr/share leftovers"
sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1
sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1
sudo rm -rf /usr/share/swift > /dev/null 2>&1
df -h
echo "remove other leftovers"
sudo rm -rf /var/lib/mysql > /dev/null 2>&1
sudo rm -rf /home/runner/.dotnet > /dev/null 2>&1
sudo rm -rf /home/runneradmin/.dotnet > /dev/null 2>&1
sudo rm -rf /etc/skel/.dotnet > /dev/null 2>&1
sudo rm -rf /usr/local/.ghcup > /dev/null 2>&1
sudo rm -rf /usr/local/aws-cli > /dev/null 2>&1
sudo rm -rf /usr/local/lib/node_modules > /dev/null 2>&1
sudo rm -rf /usr/lib/heroku > /dev/null 2>&1
sudo rm -rf /usr/local/share/chromium > /dev/null 2>&1
df -h
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.pyver }}
- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.pyver }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Install Dependencies
run: |
conda install cuda-toolkit -c "nvidia/label/cuda-${CUDA_VERSION}.0"
# Refer to https://pytorch.org/get-started/locally/
python -m pip install torch
python -m pip install --upgrade build setuptools wheel ninja numpy gekko pandas
- name: Check install
run: |
python -c "import torch; print('torch version:', torch.__version__)"
- name: Build Wheel
run: |
# For some reason $CONDA_PREFIX is empty.
export CUDA_HOME=/usr/share/miniconda
export CUDA_PATH=/usr/share/miniconda
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${CONDA_PREFIX}/lib"
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export PYPI_RELEASE="1"
echo "CUDA_PATH:"
echo $CUDA_PATH
echo "PYPI_RELEASE:"
echo $PYPI_RELEASE
python setup.py sdist bdist_wheel
- uses: actions/upload-artifact@v3
with:
name: 'linux-cuda-wheels-pypi'
path: ./dist/*.whl
name: Build AutoGPTQ Wheels for PyPI with CUDA for Windows
on: workflow_dispatch
jobs:
build_wheels:
if: ${{ github.repository_owner == 'AutoGPTQ' }}
name: Build wheels for ${{ matrix.os }} and Python ${{ matrix.pyver }} and CUDA 12.1
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [windows-latest]
pyver: ["3.8", "3.9", "3.10", "3.11"]
defaults:
run:
shell: pwsh
env:
CUDA_VERSION: "12.1"
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.pyver }}
- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.pyver }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Install Dependencies
run: |
conda install cuda-toolkit -c "nvidia/label/cuda-${env:CUDA_VERSION}.0"
# Refer to https://pytorch.org/get-started/locally/
python -m pip install torch --index-url https://download.pytorch.org/whl/cu121
python -m pip install --upgrade build setuptools wheel ninja numpy gekko pandas
- name: Check install
run: |
python -c "import torch; print('torch version:', torch.__version__)"
- name: Build Wheel
run: |
$env:CUDA_PATH = $env:CONDA_PREFIX
$env:CUDA_HOME = $env:CONDA_PREFIX
$env:TORCH_CUDA_ARCH_LIST = '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX'
$env:PYPI_RELEASE = "1"
echo "CUDA_PATH:"
echo $env:CUDA_PATH
echo "PYPI_RELEASE:"
echo $env:PYPI_RELEASE
python setup.py sdist bdist_wheel
- uses: actions/upload-artifact@v3
with:
name: 'windows-cuda-wheels-pypi'
path: ./dist/*.whl
name: Build AutoGPTQ Wheels with ROCm
on: workflow_dispatch
jobs:
build_wheels:
if: ${{ github.repository_owner == 'AutoGPTQ' }}
strategy:
matrix:
os: [ubuntu-20.04]
python: ["3.8", "3.9", "3.10", "3.11"]
rocm: ["5.7.3"] # we build only for ROCm 5.7 to match PyTorch 2.2.0 and PyTorch 2.2 nightly
name: Build wheels for ${{ matrix.os }} and Python ${{ matrix.python }} and ROCm ${{ matrix.rocm }}
runs-on: ${{ matrix.os }}
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v3
- name: Free disk space
run: |
# Go from 19G to 54G free disk space in 3min
df -h
sudo apt-get update
sudo apt-get purge -y '^apache.*'
sudo apt-get purge -y '^imagemagick.*'
sudo apt-get purge -y '^dotnet.*'
sudo apt-get purge -y '^aspnetcore.*'
sudo apt-get purge -y 'php.*'
sudo apt-get purge -y '^temurin.*'
sudo apt-get purge -y '^mysql.*'
sudo apt-get purge -y '^java.*'
sudo apt-get purge -y '^openjdk.*'
sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel
df -h
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get clean
df -h
echo "https://github.com/actions/virtual-environments/issues/709"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -h
echo "remove big /usr/local"
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
df -h
echo "remove /usr/share leftovers"
sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1
sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1
sudo rm -rf /usr/share/swift > /dev/null 2>&1
df -h
echo "remove other leftovers"
sudo rm -rf /var/lib/mysql > /dev/null 2>&1
sudo rm -rf /home/runner/.dotnet > /dev/null 2>&1
sudo rm -rf /home/runneradmin/.dotnet > /dev/null 2>&1
sudo rm -rf /etc/skel/.dotnet > /dev/null 2>&1
sudo rm -rf /usr/local/.ghcup > /dev/null 2>&1
sudo rm -rf /usr/local/aws-cli > /dev/null 2>&1
sudo rm -rf /usr/local/lib/node_modules > /dev/null 2>&1
sudo rm -rf /usr/lib/heroku > /dev/null 2>&1
sudo rm -rf /usr/local/share/chromium > /dev/null 2>&1
df -h
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python }}
- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.python }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Set up environment
run: |
echo "Using python:"
python --version
which python
if [[ "${{ matrix.rocm }}" == "5.4.2" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.4.50402-1_all.deb
elif [[ "${{ matrix.rocm }}" == "5.6.1" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.6.50601-1_all.deb
elif [[ "${{ matrix.rocm }}" == "5.7.3" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.7.50703-1_all.deb
else
echo Unknown rocm version
exit 1
fi
curl -O https://repo.radeon.com/amdgpu-install/${{ matrix.rocm }}/ubuntu/focal/$ROCM_DL_FILE
sudo dpkg -i $ROCM_DL_FILE
sudo DEBIAN_FRONTEND=noninteractive amdgpu-install --usecase=rocm --no-dkms --no-32 -y
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends rocsparse-dev rocthrust-dev rocblas-dev hipblas-dev hipsparse-dev
python -m pip install --upgrade build setuptools wheel ninja numpy gekko pandas
if [[ "${{ matrix.rocm }}" == "5.7.3" ]]; then
echo "Using PyTorch stable"
python -m pip install torch --index-url https://download.pytorch.org/whl/rocm5.7
else
echo Unknown rocm version for python install
exit 1
fi
- name: Build wheels
run: |
echo "Using python for build:"
python --version
which python
ROCM_VERSION=${{ matrix.rocm }} python setup.py sdist bdist_wheel
- uses: actions/upload-artifact@v3
with:
name: 'linux-rocm-wheels'
path: ./dist/*.whl
name: check_code_quality
on:
push:
branches: [ main ]
paths:
- "auto_gptq/**.py"
- "tests/**.py"
- "examples/**.py"
- "setup.py"
pull_request:
branches: [ main ]
paths:
- "auto_gptq/**.py"
- "tests/**.py"
- "examples/**.py"
- "setup.py"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
build:
strategy:
fail-fast: false
matrix:
python-version: [3.9]
os: [ubuntu-22.04]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- name: Free disk space
run: |
# Go from 19G to 54G free disk space in 3min
df -h
sudo apt-get update
sudo apt-get purge -y '^apache.*'
sudo apt-get purge -y '^imagemagick.*'
sudo apt-get purge -y '^dotnet.*'
sudo apt-get purge -y '^aspnetcore.*'
sudo apt-get purge -y 'php.*'
sudo apt-get purge -y '^temurin.*'
sudo apt-get purge -y '^mysql.*'
sudo apt-get purge -y '^java.*'
sudo apt-get purge -y '^openjdk.*'
sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel
df -h
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get clean
df -h
echo "https://github.com/actions/virtual-environments/issues/709"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -h
echo "remove big /usr/local"
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
df -h
echo "remove /usr/share leftovers"
sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1
sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1
sudo rm -rf /usr/share/swift > /dev/null 2>&1
df -h
echo "remove other leftovers"
sudo rm -rf /var/lib/mysql > /dev/null 2>&1
sudo rm -rf /home/runner/.dotnet > /dev/null 2>&1
sudo rm -rf /home/runneradmin/.dotnet > /dev/null 2>&1
sudo rm -rf /etc/skel/.dotnet > /dev/null 2>&1
sudo rm -rf /usr/local/.ghcup > /dev/null 2>&1
sudo rm -rf /usr/local/aws-cli > /dev/null 2>&1
sudo rm -rf /usr/local/lib/node_modules > /dev/null 2>&1
sudo rm -rf /usr/lib/heroku > /dev/null 2>&1
sudo rm -rf /usr/local/share/chromium > /dev/null 2>&1
df -h
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.python-version }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Install dependencies
run: |
conda install cuda-toolkit -c "nvidia/label/cuda-12.1.0"
# Refer to https://pytorch.org/get-started/locally/
python -m pip install torch
python -m pip install --upgrade build setuptools wheel numpy
- name: Check install
run: |
python -c "import torch; print('torch version:', torch.__version__)"
- name: Install AutoGPTQ
run: |
# For some reason $CONDA_PREFIX is empty.
export CUDA_HOME=/usr/share/miniconda
export CUDA_PATH=/usr/share/miniconda
echo "CUDA_HOME:"
echo $CUDA_HOME
echo "CUDA_PATH:"
echo $CUDA_PATH
pip install -vvv .[quality]
- name: Check style with ruff
run: |
ruff auto_gptq examples tests setup.py
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Build with: `docker build -f Dockerfile -t autogptq .`
# Run with: `docker run --gpus all --rm -it autogptq:latest /bin/bash`
FROM nvcr.io/nvidia/cuda:12.1.0-runtime-ubuntu22.04
RUN apt update && \
apt install -y wget git && \
apt clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
ENV PATH="/root/miniconda3/bin:${PATH}"
ARG PATH="/root/miniconda3/bin:${PATH}"
RUN wget \
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
&& mkdir .conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh
RUN conda init bash
RUN pip install --upgrade pip
RUN pip install --upgrade numpy torch setuptools wheel
RUN git clone https://github.com/AutoGPTQ/AutoGPTQ.git
WORKDIR /AutoGPTQ
RUN pip install -vvv .
# Build with: `docker build -f Dockerfile_amd -t autogptq-rocm .`
# Run with: `docker run --rm -it --shm-size=150G --device /dev/kfd --device /dev/dri --net host --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined autogptq-rocm:latest /bin/bash`
FROM rocm/dev-ubuntu-22.04:5.7
RUN apt update && \
apt install -y wget \
git \
rocsparse-dev \
hipsparse-dev \
rocthrust-dev \
rocblas-dev \
hipblas-dev && \
apt clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
ENV PATH="/root/miniconda3/bin:${PATH}"
ARG PATH="/root/miniconda3/bin:${PATH}"
RUN wget \
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
&& mkdir .conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh
RUN conda init bash
RUN pip install --upgrade pip
RUN pip install --upgrade numpy setuptools wheel ninja packaging
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7
RUN git clone https://github.com/AutoGPTQ/AutoGPTQ.git
WORKDIR /AutoGPTQ
RUN ROCM_VERSION="5.7" pip install -vvv .
MIT License
Copyright (c) 2023 潘其威(William)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
global-include autogptq_extension/**/*.cuh
global-include autogptq_extension/**/*.h
global-include autogptq_extension/**/*.cpp
global-include autogptq_extension/**/*.cu
global-include autogptq_extension/**/*.py
style:
ruff auto_gptq examples tests setup.py --fix
<h1 align="center">AutoGPTQ</h1>
<p align="center">An easy-to-use LLM quantization package with user-friendly APIs, based on GPTQ algorithm (weight-only quantization).</p>
<p align="center">
<a href="https://github.com/PanQiWei/AutoGPTQ/releases">
<img alt="GitHub release" src="https://img.shields.io/github/release/PanQiWei/AutoGPTQ.svg">
</a>
<a href="https://pypi.org/project/auto-gptq/">
<img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dd/auto-gptq">
</a>
</p>
<h4 align="center">
<p>
<b>English</b> |
<a href="https://github.com/PanQiWei/AutoGPTQ/blob/main/README_zh.md">中文</a>
</p>
</h4>
## News or Update
- 2024-02-15 - (News) - AutoGPTQ 0.7.0 is released, adding support for the [Marlin](https://github.com/IST-DASLab/marlin) int4*fp16 matrix multiplication kernel via the argument `use_marlin=True` when loading models.
- 2023-08-23 - (News) - 🤗 Transformers, optimum and peft have integrated `auto-gptq`, so running and training GPTQ models is now more accessible to everyone! See [this blog](https://huggingface.co/blog/gptq-integration) and its resources for more details!
*For more history, please turn to [here](docs/NEWS_OR_UPDATE.md)*
## Performance Comparison
### Inference Speed
> The results were generated with [this script](examples/benchmark/generation_speed.py): the input batch size is 1, the decoding strategy is beam search, the model is forced to generate 512 tokens, and the speed metric is tokens/s (higher is better).
>
> The quantized model is loaded with the settings that give the fastest inference speed.
| model | GPU | num_beams | fp16 | gptq-int4 |
|---------------|---------------|-----------|-------|-----------|
| llama-7b | 1xA100-40G | 1 | 18.87 | 25.53 |
| llama-7b | 1xA100-40G | 4 | 68.79 | 91.30 |
| moss-moon 16b | 1xA100-40G | 1 | 12.48 | 15.25 |
| moss-moon 16b | 1xA100-40G | 4 | OOM | 42.67 |
| moss-moon 16b | 2xA100-40G | 1 | 06.83 | 06.78 |
| moss-moon 16b | 2xA100-40G | 4 | 13.10 | 10.80 |
| gpt-j 6b | 1xRTX3060-12G | 1 | OOM | 29.55 |
| gpt-j 6b | 1xRTX3060-12G | 4 | OOM | 47.36 |
### Perplexity
For a perplexity comparison, you can turn to [here](https://github.com/qwopqwop200/GPTQ-for-LLaMa#result) and [here](https://github.com/qwopqwop200/GPTQ-for-LLaMa#gptq-vs-bitsandbytes).
## Installation
AutoGPTQ is available on Linux and Windows only. You can install the latest stable release of AutoGPTQ from pip with pre-built wheels:
| CUDA/ROCm version | Installation | Built against PyTorch |
|-------------------|---------------------------------------------------------------------------------------------------|-----------------------|
| CUDA 11.8 | `pip install auto-gptq --no-build-isolation --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/` | 2.2.1+cu118 |
| CUDA 12.1 | `pip install auto-gptq --no-build-isolation` | 2.2.1+cu121 |
| ROCm 5.7 | `pip install auto-gptq --no-build-isolation --extra-index-url https://huggingface.github.io/autogptq-index/whl/rocm573/` | 2.2.1+rocm5.7 |
AutoGPTQ can be installed with the Triton dependency via `pip install auto-gptq[triton] --no-build-isolation` in order to use the Triton backend (currently Linux only; 3-bit quantization is not supported).
For older AutoGPTQ, please refer to [the previous releases installation table](docs/INSTALLATION.md).
On NVIDIA systems, AutoGPTQ does not support [Maxwell or lower](https://qiita.com/uyuni/items/733a93b975b524f89f46) GPUs.
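If you are unsure which row of the table above matches your environment, a quick check like the one below can help. This is only a small sketch (not part of AutoGPTQ) that inspects the local PyTorch build and the compute capability of the first GPU:

```python
import torch

print("PyTorch:", torch.__version__)
print("Built against CUDA:", torch.version.cuda)  # None on CPU-only or ROCm builds
print("Built against ROCm/HIP:", getattr(torch.version, "hip", None))  # None on CUDA builds

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    print(f"Compute capability of GPU 0: {major}.{minor}")
    # The pre-built kernels target compute capability 6.0+ (Pascal and newer);
    # Maxwell (5.x) and older GPUs are not supported.
    if (major, minor) < (6, 0):
        print("This GPU is too old for the pre-built AutoGPTQ kernels.")
```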
### Install from source
Clone the source code:
```bash
git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
```
A few packages are required in order to build from source: `pip install numpy gekko pandas`.
Then, install locally from source:
```bash
pip install -vvv --no-build-isolation -e .
```
You can set `BUILD_CUDA_EXT=0` to disable building the PyTorch extension, but this is **strongly discouraged** as AutoGPTQ then falls back on a slow Python implementation.
As a last resort, if the above command fails, you can try `python setup.py install`.
#### On ROCm systems
To install from source for AMD GPUs supporting ROCm, please specify the `ROCM_VERSION` environment variable. Example:
```bash
ROCM_VERSION=5.6 pip install -vvv --no-build-isolation -e .
```
The compilation can be sped up by specifying the `PYTORCH_ROCM_ARCH` variable ([reference](https://github.com/pytorch/pytorch/blob/7b73b1e8a73a1777ebe8d2cd4487eb13da55b3ba/setup.py#L132)) in order to build for a single target device, for example `gfx90a` for MI200 series devices.
For ROCm systems, the packages `rocsparse-dev`, `hipsparse-dev`, `rocthrust-dev`, `rocblas-dev` and `hipblas-dev` are required to build.
## Quick Tour
### Quantization and Inference
> Warning: this is just a showcase of the basic AutoGPTQ APIs. It uses only one sample to quantize a very small model, so a model quantized with so few samples may not be of good quality.
Below is an example of the simplest way to use `auto_gptq` to quantize a model and run inference after quantization:
```python
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import logging
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)
pretrained_model_dir = "facebook/opt-125m"
quantized_model_dir = "opt-125m-4bit"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = [
tokenizer(
"auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
)
]
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # setting to False can significantly speed up inference but perplexity may be slightly worse
)
# load un-quantized model, by default, the model will always be loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
# quantize the model; the examples should be a list of dicts whose keys can only be "input_ids" and "attention_mask"
model.quantize(examples)
# save quantized model
model.save_quantized(quantized_model_dir)
# save quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)
# push quantized model to Hugging Face Hub.
# to use use_auth_token=True, log in first via huggingface-cli login,
# or pass an explicit token with: use_auth_token="hf_xxxxxxx"
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
# alternatively you can save and push at the same time
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)
# load quantized model to the first GPU
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")
# download quantized model from Hugging Face Hub and load to the first GPU
# model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)
# inference with model.generate
print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))
# or you can also use pipeline
pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
print(pipeline("auto-gptq is")[0]["generated_text"])
```
For more advanced features of model quantization, please refer to [this script](examples/quantization/quant_with_alpaca.py).
### Customize Model
<details>
<summary>Below is an example of extending `auto_gptq` to support the `OPT` model; as you will see, it's very easy:</summary>
```python
from auto_gptq.modeling import BaseGPTQForCausalLM
class OPTGPTQForCausalLM(BaseGPTQForCausalLM):
# chained attribute name of transformer layer block
layers_block_name = "model.decoder.layers"
# chained attribute names of other nn modules that are at the same level as the transformer layer block
outside_layer_modules = [
"model.decoder.embed_tokens", "model.decoder.embed_positions", "model.decoder.project_out",
"model.decoder.project_in", "model.decoder.final_layer_norm"
]
# chained attribute names of linear layers in transformer layer module
# normally there are four sub-lists; the modules in each can be seen as one operation,
# and the order should match the order in which they are actually executed; in this case (and usually in most cases),
# they are: attention q/k/v projection, attention output projection, MLP input projection, MLP output projection
inside_layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.out_proj"],
["fc1"],
["fc2"]
]
```
After this, you can use `OPTGPTQForCausalLM.from_pretrained` and other methods as shown in the Quick Tour; a usage sketch follows this section.
</details>
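As a rough usage sketch (assuming the `OPTGPTQForCausalLM` class from the example above is in scope, and using `facebook/opt-125m` purely as a stand-in checkpoint), the custom class is driven exactly like the built-in classes from the Quick Tour:

```python
from transformers import AutoTokenizer
from auto_gptq import BaseQuantizeConfig

pretrained_model_dir = "facebook/opt-125m"  # stand-in model for illustration

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = [tokenizer("auto-gptq is an easy-to-use model quantization library.")]

quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)

# the custom class exposes the same interface as the built-in model classes
model = OPTGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
model.quantize(examples)
model.save_quantized("opt-125m-4bit", use_safetensors=True)
```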
### Evaluation on Downstream Tasks
You can use the tasks defined in `auto_gptq.eval_tasks` to evaluate a model's performance on a specific downstream task before and after quantization.
The predefined tasks support all causal language models implemented in [🤗 transformers](https://github.com/huggingface/transformers) and in this project.
<details>
<summary>Below is an example of evaluating `EleutherAI/gpt-j-6b` on a sequence-classification task using the `cardiffnlp/tweet_sentiment_multilingual` dataset:</summary>
```python
from functools import partial
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from auto_gptq.eval_tasks import SequenceClassificationTask
MODEL = "EleutherAI/gpt-j-6b"
DATASET = "cardiffnlp/tweet_sentiment_multilingual"
TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:"
ID2LABEL = {
0: "negative",
1: "neutral",
2: "positive"
}
LABELS = list(ID2LABEL.values())
def ds_refactor_fn(samples):
text_data = samples["text"]
label_data = samples["label"]
new_samples = {"prompt": [], "label": []}
for text, label in zip(text_data, label_data):
prompt = TEMPLATE.format(labels=LABELS, text=text)
new_samples["prompt"].append(prompt)
new_samples["label"].append(ID2LABEL[label])
return new_samples
# model = AutoModelForCausalLM.from_pretrained(MODEL).eval().half().to("cuda:0")
model = AutoGPTQForCausalLM.from_pretrained(MODEL, BaseQuantizeConfig())
tokenizer = AutoTokenizer.from_pretrained(MODEL)
task = SequenceClassificationTask(
model=model,
tokenizer=tokenizer,
classes=LABELS,
data_name_or_path=DATASET,
prompt_col_name="prompt",
label_col_name="label",
**{
"num_samples": 1000, # how many samples will be sampled to evaluation
"sample_max_len": 1024, # max tokens for each sample
"block_max_len": 2048, # max tokens for each data block
# function to load dataset, one must only accept data_name_or_path as input
# and return datasets.Dataset
"load_fn": partial(datasets.load_dataset, name="english"),
# function to preprocess dataset, which is used for datasets.Dataset.map,
# must return Dict[str, list] with only two keys: [prompt_col_name, label_col_name]
"preprocess_fn": ds_refactor_fn,
# truncate label when sample's length exceed sample_max_len
"truncate_prompt": False
}
)
# note that max_new_tokens will be automatically specified internally based on given classes
print(task.run())
# self-consistency
print(
task.run(
generation_config=GenerationConfig(
num_beams=3,
num_return_sequences=3,
do_sample=True
)
)
)
```
</details>
## Learn More
The [tutorials](docs/tutorial) provide step-by-step guidance for integrating `auto_gptq` into your own project, along with some best-practice principles.
The [examples](examples/README.md) provide plenty of example scripts for using `auto_gptq` in different ways.
## Supported Models
> You can use `model.config.model_type` to compare with the table below to check whether the model you are using is supported by `auto_gptq` (a short lookup snippet follows the table).
>
> For example, the `model_type` of `WizardLM`, `vicuna` and `gpt4all` are all `llama`, hence they are all supported by `auto_gptq`.
| model type | quantization | inference | peft-lora | peft-ada-lora | peft-adaption_prompt |
|------------------------------------|--------------|-----------|-----------|---------------|-------------------------------------------------------------------------------------------------|
| bloom | ✅ | ✅ | ✅ | ✅ | |
| gpt2 | ✅ | ✅ | ✅ | ✅ | |
| gpt_neox | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| gptj | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| llama | ✅ | ✅ | ✅ | ✅ | ✅ |
| moss | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| opt | ✅ | ✅ | ✅ | ✅ | |
| gpt_bigcode | ✅ | ✅ | ✅ | ✅ | |
| codegen | ✅ | ✅ | ✅ | ✅ | |
| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | ✅ | ✅ | |
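If you are unsure which row applies to a given checkpoint, the snippet below is a small sketch that reads `model_type` from the checkpoint's config with the standard 🤗 Transformers API (only `config.json` is fetched, not the weights); the model names are placeholders:

```python
from transformers import AutoConfig

for name in ("facebook/opt-125m", "EleutherAI/gpt-j-6b"):
    config = AutoConfig.from_pretrained(name)
    print(name, "->", config.model_type)  # e.g. "opt", "gptj"
```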
## Supported Evaluation Tasks
Currently, `auto_gptq` supports: `LanguageModelingTask`, `SequenceClassificationTask` and `TextSummarizationTask`; more tasks will come soon!
## Running tests
Tests can be run with:
```
pytest tests/ -s
```
## FAQ
### Which kernel is used by default?
AutoGPTQ defaults to the exllamav2 int4*fp16 kernel for matrix multiplication.
### How to use Marlin kernel?
Marlin is an optimized int4*fp16 kernel recently proposed at https://github.com/IST-DASLab/marlin. It is integrated into AutoGPTQ and used when loading a model with `use_marlin=True`. This kernel is available only on devices with compute capability 8.0 or 8.6 (Ampere GPUs).
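A minimal loading sketch (assuming an existing 4-bit GPTQ checkpoint compatible with the Marlin format and an Ampere-class GPU; the model path is a placeholder):

```python
from auto_gptq import AutoGPTQForCausalLM

model = AutoGPTQForCausalLM.from_quantized(
    "opt-125m-4bit",  # placeholder: path or Hub id of a 4-bit GPTQ checkpoint
    device="cuda:0",
    use_marlin=True,  # requires compute capability 8.0/8.6, see above
)
```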
## Acknowledgement
- Special thanks to **Elias Frantar**, **Saleh Ashkboos**, **Torsten Hoefler** and **Dan Alistarh** for proposing the **GPTQ** algorithm, open-sourcing the [code](https://github.com/IST-DASLab/gptq), and releasing the [Marlin kernel](https://github.com/IST-DASLab/marlin) for mixed-precision computation.
- Special thanks to **qwopqwop200**: the quantization-related code in this project is mainly adapted from [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda).
- Special thanks to **turboderp** for releasing the [Exllama](https://github.com/turboderp/exllama) and [Exllama v2](https://github.com/turboderp/exllamav2) libraries with efficient mixed-precision kernels.
<h1 align="center">Yuan2-M32基于AutoGPTQ的量化和推理</h1>
## 配置AutoGPTQ环境
- AutoGPTQ环境配置要求:CUDA版本高于11.8
- 容器:使用[vllm](https://github.com/IEI-mjx/Yuan2.0-M32/blob/main/vllm/README_Yuan_vllm.md)项目提供的镜像创建容器
```shell
# enter the container
docker exec -it vllm_yuan bash
# go to your working directory
cd /mnt
# clone this project
git clone https://github.com/IEIT-Yuan/Yuan2.0-M32.git
# enter the AutoGPTQ project
cd Yuan2.0-M32/3rd_party/AutoGPTQ
# install auto-gptq
pip install auto-gptq --no-build-isolation
```
## Quantizing the Yuan2-M32-HF Model
Quantizing the Yuan2-M32 model involves three main steps: 1. download the Yuan2-M32-HF model; 2. download the dataset; 3. set the quantization parameters and quantize the Yuan2-M32-HF model.
- 1. Download the Yuan2-M32 Hugging Face model and move it to the target path (/mnt/beegfs2/Yuan2-M32-HF); see [vllm](https://github.com/IEI-mjx/Yuan2.0-M32/blob/main/vllm/README_Yuan_vllm.md) for reference. Model download address: https://huggingface.co/IEIT-Yuan/Yuan2-M32-hf
- 2. Download the dataset from [here](https://huggingface.co/datasets/hakurei/open-instruct-v1) and move it to a target path such as /mnt/beegfs2/
- 3. Adjust the quantization parameters and run the quantization as follows
```shell
# edit Yuan2-M32-int4.py
cd /mnt/beegfs2/Yuan2.0-M32/3rd_party/AutoGPTQ
vim Yuan2-M32-int4.py
'''
pretrained_model_dir = "/mnt/beegfs2/Yuan2-M32-HF"
quantized_model_dir = "/mnt/beegfs2/Yuan2-M32-GPTQ-int4"
tokenizer = LlamaTokenizer.from_pretrained("/mnt/beegfs2/Yuan2-M32-HF", add_eos_token=False, add_bos_token=False, eos_token='<eod>', use_fast=True)
tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>','<commit_before>','<commit_msg>','<commit_after>','<jupyter_start>','<jupyter_text>','<jupyter_code>','<jupyter_output>','<empty_output>'], special_tokens=True)
examples = []
with open("/mnt/beegfs2/instruct_data.json", 'r', encoding='utf-8') as file: # 数据集路径
data = json.load(file)
for i, item in enumerate(data):
if i >= 2000:
break
instruction = item.get('instruction', '')
output = item.get('output', '')
combined_text = instruction + " " + output
examples.append(tokenizer(combined_text))
max_memory = {0: "80GIB", 1: "80GIB", 2: "80GIB", 3: "80GIB", 4: "80GIB", 5: "80GIB", 6: "80GIB", 7: "80GIB"}
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # setting to False can significantly speed up inference but perplexity may be slightly worse
)
'''
# 1. Set pretrained_model_dir and specify quantized_model_dir for the quantized output
# 2. Set the dataset path
# 3. max_memory specifies which GPUs to use
# 4. Adjust the quantization parameters: bits=4 for INT4 precision, bits=8 for INT8; the other parameters can keep the defaults (an INT8 config sketch follows this block)
# run the script
python Yuan2-M32-int4.py
# quantizing and packing the model takes about 8 hours; INT4 and INT8 versions can be quantized at the same time on different GPUs
```
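For reference, an INT8 variant of the quantization config used in `Yuan2-M32-int4.py` only changes the `bits` argument; this is a sketch, with the other settings kept at the values shown above:

```python
from auto_gptq import BaseQuantizeConfig

quantize_config_int8 = BaseQuantizeConfig(
    bits=8,          # INT8 precision; use bits=4 for INT4
    group_size=128,  # 128 is the recommended value
    desc_act=False,  # False trades slightly worse perplexity for much faster inference
)
```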
## Inference with the GPTQ-Quantized Model
After quantization finishes, the target folder contains a checkpoint file with the .safetensors suffix along with config.json and quantize_config.json; you first need to copy the tokenizer-related files over from the Yuan2-M32-HF directory.
```shell
# go to the Yuan2-M32-HF directory
cd /mnt/beegfs2/Yuan2-M32-HF
# copy the tokenizer-related files to Yuan2-M32-GPTQ-int4
cp special_tokens_map.json tokenizer* /mnt/beegfs2/Yuan2-M32-GPTQ-int4
# edit inference.py
cd /mnt/beegfs2/Yuan2.0-M32/3rd_party/AutoGPTQ
vim inference.py
'''
quantized_model_dir = "/mnt/beegfs2/Yuan2-M32-GPTQ-int4"
tokenizer = LlamaTokenizer.from_pretrained('/mnt/beegfs2/Yuan2-M32-GPTQ-int4', add_eos_token=False, add_bos_token=False, eos_token='<eod>')
tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>','<commit_before>','<commit_msg>','<commit_after>','<jupyter_start>','<jupyter_text>','<jupyter_code>','<jupyter_output>','<empty_output>'], special_tokens=True)
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", trust_remote_code=True)
'''
# set quantized_model_dir and the tokenizer path
# run inference.py (a minimal sketch follows this block)
python inference.py
```
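For reference, a minimal `inference.py` along the lines of the excerpt above could look like the following sketch (the prompt string and generation length are placeholders; the special-token list matches the quantization script):

```python
from transformers import LlamaTokenizer
from auto_gptq import AutoGPTQForCausalLM

quantized_model_dir = "/mnt/beegfs2/Yuan2-M32-GPTQ-int4"

tokenizer = LlamaTokenizer.from_pretrained(
    quantized_model_dir, add_eos_token=False, add_bos_token=False, eos_token='<eod>'
)
tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>',
                      '<FIM_MIDDLE>', '<commit_before>', '<commit_msg>', '<commit_after>',
                      '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>',
                      '<empty_output>'], special_tokens=True)

# load the quantized checkpoint onto the first GPU
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", trust_remote_code=True)

prompt = "Write a function that sorts a list of integers."  # placeholder prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=512)
print(tokenizer.decode(outputs[0]))
```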
## Inference Accuracy & Performance Tests
> The HumanEval test parameters are as follows:
> generation_params = {
>     "max_new_tokens": 512,
>     "top_k": 1,
>     "top_p": 0,
>     "temperature": 1.0,
> }
> BF16 model inference uses two 80GB GPUs; GPTQ-INT4/INT8 model inference uses one 80GB GPU.
> Test results:
| Model               | Precision | HumanEval | Inference Speed | Inference Memory Usage |
|---------------------|-----------|-----------|-----------------|------------------------|
| Yuan2-M32-HF        | BF16      | 73.17%    | 13.16 token/s   | 76.34 GB               |
| Yuan2-M32-GPTQ-int8 | INT8      | 72.56%    | 9.05 token/s    | 39.81 GB               |
| Yuan2-M32-GPTQ-int4 | INT4      | 66.46%    | 9.24 token/s    | 23.27 GB               |
<h1 align="center">AutoGPTQ</h1>
<p align="center">一个基于 GPTQ 算法,简单易用且拥有用户友好型接口的大语言模型量化工具包。</p>
<p align="center">
<a href="https://github.com/PanQiWei/AutoGPTQ/releases">
<img alt="GitHub release" src="https://img.shields.io/github/release/PanQiWei/AutoGPTQ.svg">
</a>
<a href="https://pypi.org/project/auto-gptq/">
<img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dd/auto-gptq">
</a>
</p>
<h4 align="center">
<p>
<a href="https://github.com/PanQiWei/AutoGPTQ/blob/main/README.md">English</a> |
<b>中文</b>
</p>
</h4>
Note: The English README is likely to be more up to date.
## The Road to v1.0.0
Hi, fellow community members, long time no see! I'm sorry that I haven't been able to update this project frequently lately due to personal reasons. The past few weeks have been significant for my career plans. Not long ago I formally said goodbye to the startup team I had joined for two years after graduation. I'm very grateful to the leaders and colleagues there for their trust and guidance, which allowed me to grow rapidly over those two years, and I'm also deeply thankful that the team let me use their internal A100 GPU server cluster free of charge since the founding of the AutoGPTQ project to run experiments and performance evaluations. (Of course, I can no longer use it going forward, so **I would be very grateful for any new hardware sponsorship!**) Over the past two years I worked there as an algorithm engineer, responsible for the architecture design and development of dialogue systems based on large language models. We successfully launched a product called gemsouls, but unfortunately it has ceased operation. Now that team is about to launch a new product called [modelize](https://www.beta.modelize.ai/), **an LLM-native AI agent platform where users can build a highly automated team of multiple AI agents that cooperate in workflows to complete complex projects efficiently.**
Back to the topic: I'm very excited to see that over the past few months, research on optimizing the inference performance of large language models has made tremendous progress. We can now not only run LLM inference on high-end GPUs, but also easily run LLMs on CPUs and edge devices. This series of technical advances makes me eager to contribute more to the open-source community, so, first, I will spend about four weeks iterating AutoGPTQ to the official v1.0.0 release; during this period there will also be two or three minor releases so that users can try out the performance optimizations and new features early. In my vision, **by the time v1.0.0 is officially released, AutoGPTQ will serve as a flexible and extensible quantization backend that supports all GPTQ-like methods and automatically quantizes LLMs written in PyTorch.** I described the development plan in detail [here](https://github.com/PanQiWei/AutoGPTQ/issues/348); feel free to join the discussion and share your suggestions!
## News or Update
- 2023-08-23 - (News) - 🤗 Transformers, optimum and peft have integrated `auto-gptq`, making it easier to run inference with and train GPTQ models! Read [this blog post](https://huggingface.co/blog/gptq-integration) and the related resources for more details!
- 2023-08-21 - (News) - The Tongyi Qianwen team released a 4-bit quantized version of Qwen-7B based on `auto-gptq`, together with [detailed benchmark results](https://huggingface.co/Qwen/Qwen-7B-Chat-Int4#%E9%87%8F%E5%8C%96-quantization).
- 2023-08-06 - (Update) - Support exllama's q4 CUDA kernel, giving int4 quantized models at least a 1.3x inference speedup.
- 2023-08-04 - (Update) - Support ROCm so that users with AMD GPUs can use auto-gptq's CUDA extension.
- 2023-07-26 - (Update) - An elegant [PPL benchmark script](examples/benchmark/perplexity.py) to obtain results that can be fairly compared with other libraries such as `llama.cpp`.
- 2023-06-05 - (Update) - Integrate 🤗 peft to train adapters on top of GPTQ-quantized models, supporting LoRA, AdaLoRA, AdaptionPrompt, etc.
- 2023-05-30 - (Update) - Support downloading quantized models from the 🤗 Hub and uploading quantized models to the 🤗 Hub.
*For more history, please turn to [here](docs/NEWS_OR_UPDATE.md)*
## Performance Comparison
### Inference Speed
> The results below were generated with [this script](examples/benchmark/generation_speed.py): the input batch size is 1, the decoding strategy is beam search, the model is forced to generate 512 tokens, and the speed metric is tokens/s (higher is better).
>
> The quantized model is loaded with the settings that give the fastest inference speed.
| model | GPU | num_beams | fp16 | gptq-int4 |
|---------------|---------------|-----------|-------|-----------|
| llama-7b | 1xA100-40G | 1 | 18.87 | 25.53 |
| llama-7b | 1xA100-40G | 4 | 68.79 | 91.30 |
| moss-moon 16b | 1xA100-40G | 1 | 12.48 | 15.25 |
| moss-moon 16b | 1xA100-40G | 4 | OOM | 42.67 |
| moss-moon 16b | 2xA100-40G | 1 | 06.83 | 06.78 |
| moss-moon 16b | 2xA100-40G | 4 | 13.10 | 10.80 |
| gpt-j 6b | 1xRTX3060-12G | 1 | OOM | 29.55 |
| gpt-j 6b | 1xRTX3060-12G | 4 | OOM | 47.36 |
### Perplexity (PPL)
For a perplexity comparison, you can refer to [here](https://github.com/qwopqwop200/GPTQ-for-LLaMa#result) and [here](https://github.com/qwopqwop200/GPTQ-for-LLaMa#gptq-vs-bitsandbytes).
## Installation
### Quick Installation
You can install the latest stable release of AutoGPTQ, with pre-built wheels compatible with PyTorch 2.0.1, via pip:
* For CUDA 11.7: `pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/`
* For CUDA 11.8: `pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/`
* For ROCm 5.4.2: `pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/rocm542/`
**Warning:** The pre-built wheels are not guaranteed to work with PyTorch nightly builds. If you want to use a PyTorch nightly build, please install AutoGPTQ from source.
#### Disabling the CUDA extension
By default, the CUDA extension is installed automatically when `torch` and `cuda` are already present on your machine. If you do not want the extension, install with the following command:
```shell
BUILD_CUDA_EXT=0 pip install auto-gptq
```
To also make sure the extension `autogptq_cuda` no longer exists in your virtual environment, run:
```shell
pip uninstall autogptq_cuda -y
```
#### Triton acceleration
To accelerate model inference with `triton`, use the following command:
> Warning: triton currently only supports Linux; 3-bit quantization is not supported when using triton.
```shell
pip install auto-gptq[triton]
```
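After installation, the Triton backend is enabled by passing `use_triton=True` when loading a quantized model. This is a sketch; the model path is a placeholder, and 3-bit checkpoints are not supported:

```python
from auto_gptq import AutoGPTQForCausalLM

model = AutoGPTQForCausalLM.from_quantized(
    "opt-125m-4bit",  # placeholder: path or Hub id of a quantized model
    device="cuda:0",
    use_triton=True,  # Linux only
)
```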
### Install from source
<details>
<summary>Click to see details</summary>
Clone the source code:
```shell
git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
```
Then, install from the project directory:
```shell
pip install .
```
As in the Quick Installation section, you can use `BUILD_CUDA_EXT=0` to skip building the CUDA extension.
If you want triton acceleration and your operating system supports it, use `.[triton]`.
For AMD GPUs, to install from source with ROCm support, set the `ROCM_VERSION` environment variable. Compilation can also be sped up by setting the `PYTORCH_ROCM_ARCH` variable ([reference](https://github.com/pytorch/pytorch/blob/7b73b1e8a73a1777ebe8d2cd4487eb13da55b3ba/setup.py#L132)) to build for a single target device, for example `gfx90a` for MI200 series devices. Example:
```
ROCM_VERSION=5.6 pip install .
```
For ROCm systems, the following packages must additionally be installed before building from source: `rocsparse-dev`, `hipsparse-dev`, `rocthrust-dev`, `rocblas-dev` and `hipblas-dev`.
</details>
## Quick Start
### Quantization and Inference
> Warning: this only showcases the basic AutoGPTQ APIs and uses a single piece of text to quantize a very small model, so the result may not perform as well as expected when quantizing larger models.
Below is the simplest way to use `auto_gptq` to quantize a model and run inference after quantization:
```python
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
pretrained_model_dir = "facebook/opt-125m"
quantized_model_dir = "opt-125m-4bit"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = [
tokenizer(
"auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
)
]
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # setting to False can significantly speed up inference but perplexity may be slightly worse
)
# load the un-quantized model; by default, the model will always be loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
# quantize the model; the examples should be a list of dicts whose keys can only be "input_ids" and "attention_mask"
model.quantize(examples)
# save the quantized model
model.save_quantized(quantized_model_dir)
# save the quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)
# push the quantized model to the Hugging Face Hub.
# to use use_auth_token=True, log in first via huggingface-cli login,
# or pass an explicit token with: use_auth_token="hf_xxxxxxx"
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
# alternatively you can save and push at the same time
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)
# load the quantized model onto the first GPU
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")
# download a quantized model from the Hugging Face Hub and load it onto the first GPU
# model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)
# inference with model.generate
print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))
# or you can also use the pipeline
pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
print(pipeline("auto-gptq is")[0]["generated_text"])
```
For more advanced usage, please refer to [this example script](examples/quantization/quant_with_alpaca.py).
### Customize Model
<details>
<summary>Below is an example of extending `auto_gptq` to support the `OPT` model; as you will see, it's very easy:</summary>
```python
from auto_gptq.modeling import BaseGPTQForCausalLM
class OPTGPTQForCausalLM(BaseGPTQForCausalLM):
# chained attribute name of transformer layer block
layers_block_name = "model.decoder.layers"
# chained attribute names of other nn modules that are at the same level as the transformer layer block
outside_layer_modules = [
"model.decoder.embed_tokens", "model.decoder.embed_positions", "model.decoder.project_out",
"model.decoder.project_in", "model.decoder.final_layer_norm"
]
# chained attribute names of linear layers in transformer layer module
# normally there are four sub-lists; the modules in each can be seen as one operation,
# and the order should match the order in which they are actually executed; in this case (and usually in most cases),
# they are: attention q/k/v projection, attention output projection, MLP input projection, MLP output projection
inside_layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.out_proj"],
["fc1"],
["fc2"]
]
```
Then you can use `OPTGPTQForCausalLM.from_pretrained` and other methods as shown in the Quick Start section.
</details>
### Evaluation on Downstream Tasks
You can use the tasks defined in `auto_gptq.eval_tasks` to evaluate a model's performance on a specific downstream task before and after quantization.
The predefined tasks support all causal language models implemented in [🤗 transformers](https://github.com/huggingface/transformers) and in this project.
<details>
<summary>Below is an example of evaluating `EleutherAI/gpt-j-6b` on a sequence-classification (text-classification) task using the `cardiffnlp/tweet_sentiment_multilingual` dataset:</summary>
```python
from functools import partial
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from auto_gptq.eval_tasks import SequenceClassificationTask
MODEL = "EleutherAI/gpt-j-6b"
DATASET = "cardiffnlp/tweet_sentiment_multilingual"
TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:"
ID2LABEL = {
0: "negative",
1: "neutral",
2: "positive"
}
LABELS = list(ID2LABEL.values())
def ds_refactor_fn(samples):
text_data = samples["text"]
label_data = samples["label"]
new_samples = {"prompt": [], "label": []}
for text, label in zip(text_data, label_data):
prompt = TEMPLATE.format(labels=LABELS, text=text)
new_samples["prompt"].append(prompt)
new_samples["label"].append(ID2LABEL[label])
return new_samples
# model = AutoModelForCausalLM.from_pretrained(MODEL).eval().half().to("cuda:0")
model = AutoGPTQForCausalLM.from_pretrained(MODEL, BaseQuantizeConfig())
tokenizer = AutoTokenizer.from_pretrained(MODEL)
task = SequenceClassificationTask(
model=model,
tokenizer=tokenizer,
classes=LABELS,
data_name_or_path=DATASET,
prompt_col_name="prompt",
label_col_name="label",
**{
"num_samples": 1000, # how many samples will be sampled to evaluation
"sample_max_len": 1024, # max tokens for each sample
"block_max_len": 2048, # max tokens for each data block
# function to load dataset, one must only accept data_name_or_path as input
# and return datasets.Dataset
"load_fn": partial(datasets.load_dataset, name="english"),
# function to preprocess dataset, which is used for datasets.Dataset.map,
# must return Dict[str, list] with only two keys: [prompt_col_name, label_col_name]
"preprocess_fn": ds_refactor_fn,
# truncate label when sample's length exceed sample_max_len
"truncate_prompt": False
}
)
# note that max_new_tokens will be automatically specified internally based on given classes
print(task.run())
# self-consistency
print(
task.run(
generation_config=GenerationConfig(
num_beams=3,
num_return_sequences=3,
do_sample=True
)
)
)
```
</details>
## Learn More
The [tutorials](docs/tutorial) provide step-by-step guidance for integrating `auto_gptq` into your own project, along with some best-practice principles.
The [examples](examples/README.md) provide plenty of example scripts for using `auto_gptq` in different ways.
## Supported Models
> You can use `model.config.model_type` to compare with the table below to check whether the model you are using is supported by `auto_gptq`.
>
> For example, the `model_type` of `WizardLM`, `vicuna` and `gpt4all` are all `llama`, so they are all supported by `auto_gptq`.
| model type | quantization | inference | peft-lora | peft-ada-lora | peft-adaption_prompt |
|------------------------------------|--------------|-----------|-----------|---------------|-----------------------------------------------------------------------------------|
| bloom | ✅ | ✅ | ✅ | ✅ | |
| gpt2 | ✅ | ✅ | ✅ | ✅ | |
| gpt_neox                           | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| gptj                               | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| llama                              | ✅            | ✅         | ✅         | ✅             | ✅                                                                                   |
| moss                               | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| opt | ✅ | ✅ | ✅ | ✅ | |
| gpt_bigcode | ✅ | ✅ | ✅ | ✅ | |
| codegen | ✅ | ✅ | ✅ | ✅ | |
| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | ✅ | ✅ | |
## Supported Evaluation Tasks
Currently, `auto_gptq` supports the following evaluation tasks: `LanguageModelingTask`, `SequenceClassificationTask` and `TextSummarizationTask`; more tasks will come soon!
## Acknowledgement
- Special thanks to **Elias Frantar**, **Saleh Ashkboos**, **Torsten Hoefler** and **Dan Alistarh** for proposing the **GPTQ** algorithm and open-sourcing the [code](https://github.com/IST-DASLab/gptq).
- Special thanks to **qwopqwop200**: the quantization-related code in this project is mainly adapted from [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda).
[![Star History Chart](https://api.star-history.com/svg?repos=PanQiwei/AutoGPTQ&type=Date)](https://star-history.com/#PanQiWei/AutoGPTQ&Date)
from transformers import LlamaTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import logging
import json
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)
pretrained_model_dir = "/mnt/beegfs2/Yuan2-M32-HF"
quantized_model_dir = "/mnt/beegfs2/Yuan2-M32-GPTQ-int4"
tokenizer = LlamaTokenizer.from_pretrained("/mnt/beegfs2/Yuan2-M32-HF", add_eos_token=False, add_bos_token=False, eos_token='<eod>', use_fast=True)
tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>','<commit_before>','<commit_msg>','<commit_after>','<jupyter_start>','<jupyter_text>','<jupyter_code>','<jupyter_output>','<empty_output>'], special_tokens=True)
examples = []
with open("/mnt/beegfs2/instruct_data.json", 'r', encoding='utf-8') as file:
data = json.load(file)
for i, item in enumerate(data):
if i >= 2000:
break
instruction = item.get('instruction', '')
output = item.get('output', '')
combined_text = instruction + " " + output
examples.append(tokenizer(combined_text))
max_memory = {0: "80GIB", 1: "80GIB", 2: "80GIB", 3: "80GIB", 4: "80GIB", 5: "80GIB", 6: "80GIB", 7: "80GIB"}
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # setting to False can significantly speed up inference but perplexity may be slightly worse
)
# load un-quantized model, by default, the model will always be loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config, trust_remote_code=True, max_memory = max_memory)
# quantize the model; the examples should be a list of dicts whose keys can only be "input_ids" and "attention_mask"
model.quantize(examples)
# save quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)
# push quantized model to Hugging Face Hub.
# to use use_auth_token=True, log in first via huggingface-cli login,
# or pass an explicit token with: use_auth_token="hf_xxxxxxx"
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
# alternatively you can save and push at the same time
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)