Commit 6a583c2f authored by chenych

update dtk to 24.04.1 and modify README

parent 7d576a9a
[html]
directory = coverage
[run]
data_file = .coverage_$LOCAL_RANK
---
name: Bug report
about: Create a report to help us improve
title: "[BUG]"
labels: bug
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**Hardware details**
Information about CPU and GPU, such as RAM, number, etc.
**Software version**
Version of relevant software such as operating system, CUDA toolkit, Python, auto-gptq, PyTorch, transformers, accelerate, etc.
**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Additional context**
Add any other context about the problem here.
---
name: Custom issue template
about: Describe this issue template's purpose here.
title: ''
labels: ''
assignees: ''
---
---
name: Feature request
about: Suggest an idea for this project
title: "[FEATURE]"
labels: enhancement
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.
name: Build AutoGPTQ Wheels with CUDA for Linux
on: workflow_dispatch
jobs:
build_wheels:
if: ${{ github.repository_owner == 'AutoGPTQ' }}
name: Build wheels for ${{ matrix.os }} and Python ${{ matrix.pyver }} and CUDA ${{ matrix.cuda }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-20.04]
pyver: ["3.8", "3.9", "3.10", "3.11"]
cuda: ["11.8"] # wheels for CUDA 12.1 are built in build_wheels_pypi.yml
defaults:
run:
shell: bash
env:
CUDA_VERSION: ${{ matrix.cuda }}
steps:
- uses: actions/checkout@v3
- name: Free disk space
run: |
# Go from 19G to 54G free disk space in 3min
df -h
sudo apt-get update
sudo apt-get purge -y '^apache.*'
sudo apt-get purge -y '^imagemagick.*'
sudo apt-get purge -y '^dotnet.*'
sudo apt-get purge -y '^aspnetcore.*'
sudo apt-get purge -y 'php.*'
sudo apt-get purge -y '^temurin.*'
sudo apt-get purge -y '^mysql.*'
sudo apt-get purge -y '^java.*'
sudo apt-get purge -y '^openjdk.*'
sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel
df -h
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get clean
df -h
echo "https://github.com/actions/virtual-environments/issues/709"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -h
echo "remove big /usr/local"
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
df -h
echo "remove /usr/share leftovers"
sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1
sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1
sudo rm -rf /usr/share/swift > /dev/null 2>&1
df -h
echo "remove other leftovers"
sudo rm -rf /var/lib/mysql > /dev/null 2>&1
sudo rm -rf /home/runner/.dotnet > /dev/null 2>&1
sudo rm -rf /home/runneradmin/.dotnet > /dev/null 2>&1
sudo rm -rf /etc/skel/.dotnet > /dev/null 2>&1
sudo rm -rf /usr/local/.ghcup > /dev/null 2>&1
sudo rm -rf /usr/local/aws-cli > /dev/null 2>&1
sudo rm -rf /usr/local/lib/node_modules > /dev/null 2>&1
sudo rm -rf /usr/lib/heroku > /dev/null 2>&1
sudo rm -rf /usr/local/share/chromium > /dev/null 2>&1
df -h
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.pyver }}
- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.pyver }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Install Dependencies
run: |
conda install cuda-toolkit -c "nvidia/label/cuda-${CUDA_VERSION}.0"
# Refer to https://pytorch.org/get-started/locally/
python -m pip install torch --index-url https://download.pytorch.org/whl/cu118
python -m pip install --upgrade build setuptools wheel ninja numpy gekko pandas
- name: Check install
run: |
python -c "import torch; print('torch version:', torch.__version__)"
- name: Build Wheel
run: |
# For some reason $CONDA_PREFIX is empty.
export CUDA_HOME=/usr/share/miniconda
export CUDA_PATH=/usr/share/miniconda
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CONDA_PREFIX}/lib"
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX"
echo "CUDA_PATH:"
echo $CUDA_PATH
echo "PYPI_RELEASE:"
echo $PYPI_RELEASE
python setup.py sdist bdist_wheel
- uses: actions/upload-artifact@v3
with:
name: 'linux-cuda-wheels'
path: ./dist/*.whl
name: Build AutoGPTQ Wheels with CUDA for Windows
on: workflow_dispatch
jobs:
build_wheels:
if: ${{ github.repository_owner == 'AutoGPTQ' }}
name: Build wheels for ${{ matrix.os }} and Python ${{ matrix.pyver }} and CUDA ${{ matrix.cuda }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [windows-latest]
pyver: ["3.8", "3.9", "3.10", "3.11"]
cuda: ["11.8"] # wheels for CUDA 12.1 are built in build_wheels_pypi.yml
defaults:
run:
shell: pwsh
env:
CUDA_VERSION: ${{ matrix.cuda }}
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.pyver }}
- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.pyver }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Install Dependencies
run: |
conda install cuda-toolkit -c "nvidia/label/cuda-${env:CUDA_VERSION}.0"
# Refer to https://pytorch.org/get-started/locally/
python -m pip install torch --index-url https://download.pytorch.org/whl/cu118
python -m pip install --upgrade build setuptools wheel ninja numpy gekko pandas
- name: Check install
run: |
python -c "import torch; print('torch version:', torch.__version__)"
- name: Build Wheel
run: |
$env:CUDA_PATH = $env:CONDA_PREFIX
$env:CUDA_HOME = $env:CONDA_PREFIX
$env:TORCH_CUDA_ARCH_LIST = '6.0 6.1 7.0 7.5 8.0 8.6+PTX'
if ([decimal]$env:CUDA_VERSION -ge 11.8) { $env:TORCH_CUDA_ARCH_LIST = '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
python setup.py sdist bdist_wheel
- uses: actions/upload-artifact@v3
with:
name: 'windows-cuda-wheels'
path: ./dist/*.whl
name: Build AutoGPTQ Wheels for PyPI with CUDA for Linux
on: workflow_dispatch
jobs:
build_wheels:
if: ${{ github.repository_owner == 'AutoGPTQ' }}
name: Build wheels for ${{ matrix.os }} and Python ${{ matrix.pyver }} and CUDA 12.1
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-20.04]
pyver: ["3.8", "3.9", "3.10", "3.11"]
defaults:
run:
shell: bash
env:
CUDA_VERSION: "12.1"
steps:
- uses: actions/checkout@v3
- name: Free disk space
run: |
# Go from 19G to 54G free disk space in 3min
df -h
sudo apt-get update
sudo apt-get purge -y '^apache.*'
sudo apt-get purge -y '^imagemagick.*'
sudo apt-get purge -y '^dotnet.*'
sudo apt-get purge -y '^aspnetcore.*'
sudo apt-get purge -y 'php.*'
sudo apt-get purge -y '^temurin.*'
sudo apt-get purge -y '^mysql.*'
sudo apt-get purge -y '^java.*'
sudo apt-get purge -y '^openjdk.*'
sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel
df -h
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get clean
df -h
echo "https://github.com/actions/virtual-environments/issues/709"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -h
echo "remove big /usr/local"
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
df -h
echo "remove /usr/share leftovers"
sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1
sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1
sudo rm -rf /usr/share/swift > /dev/null 2>&1
df -h
echo "remove other leftovers"
sudo rm -rf /var/lib/mysql > /dev/null 2>&1
sudo rm -rf /home/runner/.dotnet > /dev/null 2>&1
sudo rm -rf /home/runneradmin/.dotnet > /dev/null 2>&1
sudo rm -rf /etc/skel/.dotnet > /dev/null 2>&1
sudo rm -rf /usr/local/.ghcup > /dev/null 2>&1
sudo rm -rf /usr/local/aws-cli > /dev/null 2>&1
sudo rm -rf /usr/local/lib/node_modules > /dev/null 2>&1
sudo rm -rf /usr/lib/heroku > /dev/null 2>&1
sudo rm -rf /usr/local/share/chromium > /dev/null 2>&1
df -h
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.pyver }}
- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.pyver }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Install Dependencies
run: |
conda install cuda-toolkit -c "nvidia/label/cuda-${CUDA_VERSION}.0"
# Refer to https://pytorch.org/get-started/locally/
python -m pip install torch
python -m pip install --upgrade build setuptools wheel ninja numpy gekko pandas
- name: Check install
run: |
python -c "import torch; print('torch version:', torch.__version__)"
- name: Build Wheel
run: |
# For some reason $CONDA_PREFIX is empty.
export CUDA_HOME=/usr/share/miniconda
export CUDA_PATH=/usr/share/miniconda
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${CONDA_PREFIX}/lib"
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export PYPI_RELEASE="1"
echo "CUDA_PATH:"
echo $CUDA_PATH
echo "PYPI_RELEASE:"
echo $PYPI_RELEASE
python setup.py sdist bdist_wheel
- uses: actions/upload-artifact@v3
with:
name: 'linux-cuda-wheels-pypi'
path: ./dist/*.whl
name: Build AutoGPTQ Wheels for PyPI with CUDA for Windows
on: workflow_dispatch
jobs:
build_wheels:
if: ${{ github.repository_owner == 'AutoGPTQ' }}
name: Build wheels for ${{ matrix.os }} and Python ${{ matrix.pyver }} and CUDA 12.1
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [windows-latest]
pyver: ["3.8", "3.9", "3.10", "3.11"]
defaults:
run:
shell: pwsh
env:
CUDA_VERSION: "12.1"
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.pyver }}
- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.pyver }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Install Dependencies
run: |
conda install cuda-toolkit -c "nvidia/label/cuda-${env:CUDA_VERSION}.0"
# Refer to https://pytorch.org/get-started/locally/
python -m pip install torch --index-url https://download.pytorch.org/whl/cu121
python -m pip install --upgrade build setuptools wheel ninja numpy gekko pandas
- name: Check install
run: |
python -c "import torch; print('torch version:', torch.__version__)"
- name: Build Wheel
run: |
$env:CUDA_PATH = $env:CONDA_PREFIX
$env:CUDA_HOME = $env:CONDA_PREFIX
$env:TORCH_CUDA_ARCH_LIST = '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX'
$env:PYPI_RELEASE = "1"
echo "CUDA_PATH:"
echo $env:CUDA_PATH
echo "PYPI_RELEASE:"
echo $env:PYPI_RELEASE
python setup.py sdist bdist_wheel
- uses: actions/upload-artifact@v3
with:
name: 'windows-cuda-wheels-pypi'
path: ./dist/*.whl
name: Build AutoGPTQ Wheels with ROCm
on: workflow_dispatch
jobs:
build_wheels:
if: ${{ github.repository_owner == 'AutoGPTQ' }}
strategy:
matrix:
os: [ubuntu-20.04]
python: ["3.8", "3.9", "3.10", "3.11"]
rocm: ["5.7.3"] # we build only for ROCm 5.7 to match PyTorch 2.2.0 and PyTorch 2.2 nightly
name: Build wheels for ${{ matrix.os }} and Python ${{ matrix.python }} and ROCm ${{ matrix.rocm }}
runs-on: ${{ matrix.os }}
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v3
- name: Free disk space
run: |
# Go from 19G to 54G free disk space in 3min
df -h
sudo apt-get update
sudo apt-get purge -y '^apache.*'
sudo apt-get purge -y '^imagemagick.*'
sudo apt-get purge -y '^dotnet.*'
sudo apt-get purge -y '^aspnetcore.*'
sudo apt-get purge -y 'php.*'
sudo apt-get purge -y '^temurin.*'
sudo apt-get purge -y '^mysql.*'
sudo apt-get purge -y '^java.*'
sudo apt-get purge -y '^openjdk.*'
sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel
df -h
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get clean
df -h
echo "https://github.com/actions/virtual-environments/issues/709"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -h
echo "remove big /usr/local"
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
df -h
echo "remove /usr/share leftovers"
sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1
sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1
sudo rm -rf /usr/share/swift > /dev/null 2>&1
df -h
echo "remove other leftovers"
sudo rm -rf /var/lib/mysql > /dev/null 2>&1
sudo rm -rf /home/runner/.dotnet > /dev/null 2>&1
sudo rm -rf /home/runneradmin/.dotnet > /dev/null 2>&1
sudo rm -rf /etc/skel/.dotnet > /dev/null 2>&1
sudo rm -rf /usr/local/.ghcup > /dev/null 2>&1
sudo rm -rf /usr/local/aws-cli > /dev/null 2>&1
sudo rm -rf /usr/local/lib/node_modules > /dev/null 2>&1
sudo rm -rf /usr/lib/heroku > /dev/null 2>&1
sudo rm -rf /usr/local/share/chromium > /dev/null 2>&1
df -h
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python }}
- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.python }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Set up environment
run: |
echo "Using python:"
python --version
which python
if [[ "${{ matrix.rocm }}" == "5.4.2" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.4.50402-1_all.deb
elif [[ "${{ matrix.rocm }}" == "5.6.1" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.6.50601-1_all.deb
elif [[ "${{ matrix.rocm }}" == "5.7.3" ]]; then
export ROCM_DL_FILE=amdgpu-install_5.7.50703-1_all.deb
else
echo Unknown rocm version
exit 1
fi
curl -O https://repo.radeon.com/amdgpu-install/${{ matrix.rocm }}/ubuntu/focal/$ROCM_DL_FILE
sudo dpkg -i $ROCM_DL_FILE
sudo DEBIAN_FRONTEND=noninteractive amdgpu-install --usecase=rocm --no-dkms --no-32 -y
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends rocsparse-dev rocthrust-dev rocblas-dev hipblas-dev hipsparse-dev
python -m pip install --upgrade build setuptools wheel ninja numpy gekko pandas
if [[ "${{ matrix.rocm }}" == "5.7.3" ]]; then
echo "Using PyTorch stable"
python -m pip install torch --index-url https://download.pytorch.org/whl/rocm5.7
else
echo Unknown rocm version for python install
exit 1
fi
- name: Build wheels
run: |
echo "Using python for build:"
python --version
which python
ROCM_VERSION=${{ matrix.rocm }} python setup.py sdist bdist_wheel
- uses: actions/upload-artifact@v3
with:
name: 'linux-rocm-wheels'
path: ./dist/*.whl
name: check_code_quality
on:
push:
branches: [ main ]
paths:
- "auto_gptq/**.py"
- "tests/**.py"
- "examples/**.py"
- "setup.py"
pull_request:
branches: [ main ]
paths:
- "auto_gptq/**.py"
- "tests/**.py"
- "examples/**.py"
- "setup.py"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
build:
strategy:
fail-fast: false
matrix:
python-version: [3.9]
os: [ubuntu-22.04]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- name: Free disk space
run: |
# Go from 19G to 54G free disk space in 3min
df -h
sudo apt-get update
sudo apt-get purge -y '^apache.*'
sudo apt-get purge -y '^imagemagick.*'
sudo apt-get purge -y '^dotnet.*'
sudo apt-get purge -y '^aspnetcore.*'
sudo apt-get purge -y 'php.*'
sudo apt-get purge -y '^temurin.*'
sudo apt-get purge -y '^mysql.*'
sudo apt-get purge -y '^java.*'
sudo apt-get purge -y '^openjdk.*'
sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel
df -h
sudo apt-get autoremove -y >/dev/null 2>&1
sudo apt-get clean
df -h
echo "https://github.com/actions/virtual-environments/issues/709"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -h
echo "remove big /usr/local"
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf /usr/local/lib/android >/dev/null 2>&1
df -h
echo "remove /usr/share leftovers"
sudo rm -rf /usr/share/dotnet/sdk > /dev/null 2>&1
sudo rm -rf /usr/share/dotnet/shared > /dev/null 2>&1
sudo rm -rf /usr/share/swift > /dev/null 2>&1
df -h
echo "remove other leftovers"
sudo rm -rf /var/lib/mysql > /dev/null 2>&1
sudo rm -rf /home/runner/.dotnet > /dev/null 2>&1
sudo rm -rf /home/runneradmin/.dotnet > /dev/null 2>&1
sudo rm -rf /etc/skel/.dotnet > /dev/null 2>&1
sudo rm -rf /usr/local/.ghcup > /dev/null 2>&1
sudo rm -rf /usr/local/aws-cli > /dev/null 2>&1
sudo rm -rf /usr/local/lib/node_modules > /dev/null 2>&1
sudo rm -rf /usr/lib/heroku > /dev/null 2>&1
sudo rm -rf /usr/local/share/chromium > /dev/null 2>&1
df -h
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2.2.0
with:
activate-environment: "build"
python-version: ${{ matrix.python-version }}
mamba-version: "*"
use-mamba: false
channels: conda-forge,defaults
channel-priority: true
add-pip-as-python-dependency: true
auto-activate-base: false
- name: Install dependencies
run: |
conda install cuda-toolkit -c "nvidia/label/cuda-12.1.0"
# Refer to https://pytorch.org/get-started/locally/
python -m pip install torch
python -m pip install --upgrade build setuptools wheel numpy
- name: Check install
run: |
python -c "import torch; print('torch version:', torch.__version__)"
- name: Install AutoGPTQ
run: |
# For some reason $CONDA_PREFIX is empty.
export CUDA_HOME=/usr/share/miniconda
export CUDA_PATH=/usr/share/miniconda
echo "CUDA_HOME:"
echo $CUDA_HOME
echo "CUDA_PATH:"
echo $CUDA_PATH
pip install -vvv .[quality]
- name: Check style with ruff
run: |
ruff auto_gptq examples tests setup.py
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Build with: `docker build -f Dockerfile -t autogptq .`
# Run with: `docker run --gpus all --rm -it autogptq:latest /bin/bash`
FROM nvcr.io/nvidia/cuda:12.1.0-runtime-ubuntu22.04
RUN apt update && \
apt install -y wget git && \
apt clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
ENV PATH="/root/miniconda3/bin:${PATH}"
ARG PATH="/root/miniconda3/bin:${PATH}"
RUN wget \
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
&& mkdir .conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh
RUN conda init bash
RUN pip install --upgrade pip
RUN pip install --upgrade numpy torch setuptools wheel
RUN git clone https://github.com/AutoGPTQ/AutoGPTQ.git
WORKDIR /AutoGPTQ
RUN pip install -vvv .
# Build with: `docker build -f Dockerfile_amd -t autogptq-rocm .`
# Run with: `docker run --rm -it --shm-size=150G --device /dev/kfd --device /dev/dri --net host --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined autogptq-rocm:latest /bin/bash`
FROM rocm/dev-ubuntu-22.04:5.7
RUN apt update && \
apt install -y wget \
git \
rocsparse-dev \
hipsparse-dev \
rocthrust-dev \
rocblas-dev \
hipblas-dev && \
apt clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
ENV PATH="/root/miniconda3/bin:${PATH}"
ARG PATH="/root/miniconda3/bin:${PATH}"
RUN wget \
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
&& mkdir .conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh
RUN conda init bash
RUN pip install --upgrade pip
RUN pip install --upgrade numpy setuptools wheel ninja packaging
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7
RUN git clone https://github.com/AutoGPTQ/AutoGPTQ.git
WORKDIR /AutoGPTQ
RUN ROCM_VERSION="5.7" pip install -vvv .
MIT License
Copyright (c) 2023 潘其威(William)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
global-include autogptq_extension/**/*.cuh
global-include autogptq_extension/**/*.h
global-include autogptq_extension/**/*.cpp
global-include autogptq_extension/**/*.cu
global-include autogptq_extension/**/*.py
style:
ruff auto_gptq examples tests setup.py --fix
<h1 align="center">AutoGPTQ</h1>
<p align="center">An easy-to-use LLM quantization package with user-friendly APIs, based on GPTQ algorithm (weight-only quantization).</p>
<p align="center">
<a href="https://github.com/PanQiWei/AutoGPTQ/releases">
<img alt="GitHub release" src="https://img.shields.io/github/release/PanQiWei/AutoGPTQ.svg">
</a>
<a href="https://pypi.org/project/auto-gptq/">
<img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dd/auto-gptq">
</a>
</p>
<h4 align="center">
<p>
<b>English</b> |
<a href="https://github.com/PanQiWei/AutoGPTQ/blob/main/README_zh.md">中文</a>
</p>
</h4>
## News or Update
- 2024-02-15 - (News) - AutoGPTQ 0.7.0 is released, adding support for the [Marlin](https://github.com/IST-DASLab/marlin) int4*fp16 matrix multiplication kernel via the argument `use_marlin=True` when loading models.
- 2023-08-23 - (News) - 🤗 Transformers, optimum and peft have integrated `auto-gptq`, so running and training GPTQ models is now more accessible to everyone! See [this blog](https://huggingface.co/blog/gptq-integration) and its resources for more details!
*For more history, please turn to [here](docs/NEWS_OR_UPDATE.md)*
## Performance Comparison
### Inference Speed
> The results were generated with [this script](examples/benchmark/generation_speed.py): the input batch size is 1, the decoding strategy is beam search, the model is forced to generate 512 tokens, and the speed metric is tokens/s (higher is better).
>
> The quantized model is loaded with the settings that give the fastest inference speed.
| model | GPU | num_beams | fp16 | gptq-int4 |
|---------------|---------------|-----------|-------|-----------|
| llama-7b | 1xA100-40G | 1 | 18.87 | 25.53 |
| llama-7b | 1xA100-40G | 4 | 68.79 | 91.30 |
| moss-moon 16b | 1xA100-40G | 1 | 12.48 | 15.25 |
| moss-moon 16b | 1xA100-40G | 4 | OOM | 42.67 |
| moss-moon 16b | 2xA100-40G | 1 | 06.83 | 06.78 |
| moss-moon 16b | 2xA100-40G | 4 | 13.10 | 10.80 |
| gpt-j 6b | 1xRTX3060-12G | 1 | OOM | 29.55 |
| gpt-j 6b | 1xRTX3060-12G | 4 | OOM | 47.36 |
### Perplexity
For a perplexity comparison, you can turn to [here](https://github.com/qwopqwop200/GPTQ-for-LLaMa#result) and [here](https://github.com/qwopqwop200/GPTQ-for-LLaMa#gptq-vs-bitsandbytes).
## Installation
AutoGPTQ is available on Linux and Windows only. You can install the latest stable release of AutoGPTQ from pip with pre-built wheels:
| CUDA/ROCm version | Installation | Built against PyTorch |
|-------------------|---------------------------------------------------------------------------------------------------|-----------------------|
| CUDA 11.8 | `pip install auto-gptq --no-build-isolation --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/` | 2.2.1+cu118 |
| CUDA 12.1 | `pip install auto-gptq --no-build-isolation` | 2.2.1+cu121 |
| ROCm 5.7 | `pip install auto-gptq --no-build-isolation --extra-index-url https://huggingface.github.io/autogptq-index/whl/rocm573/` | 2.2.1+rocm5.7 |
AutoGPTQ can be installed with the Triton dependency via `pip install auto-gptq[triton] --no-build-isolation` in order to use the Triton backend (currently Linux only; 3-bit quantization is not supported).
For older AutoGPTQ, please refer to [the previous releases installation table](docs/INSTALLATION.md).
On NVIDIA systems, AutoGPTQ does not support [Maxwell or lower](https://qiita.com/uyuni/items/733a93b975b524f89f46) GPUs.
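If you are unsure which row of the table above matches your environment, a quick check like the one below can help. This is only a small sketch (not part of AutoGPTQ) that inspects the local PyTorch build and the compute capability of the first GPU:

```python
import torch

print("PyTorch:", torch.__version__)
print("Built against CUDA:", torch.version.cuda)  # None on CPU-only or ROCm builds
print("Built against ROCm/HIP:", getattr(torch.version, "hip", None))  # None on CUDA builds

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    print(f"Compute capability of GPU 0: {major}.{minor}")
    # The pre-built kernels target compute capability 6.0+ (Pascal and newer);
    # Maxwell (5.x) and older GPUs are not supported.
    if (major, minor) < (6, 0):
        print("This GPU is too old for the pre-built AutoGPTQ kernels.")
```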
### Install from source
Clone the source code:
```bash
git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
```
A few packages are required in order to build from source: `pip install numpy gekko pandas`.
Then, install locally from source:
```bash
pip install -vvv --no-build-isolation -e .
```
You can set `BUILD_CUDA_EXT=0` to disable building the PyTorch extension, but this is **strongly discouraged** as AutoGPTQ then falls back on a slow Python implementation.
As a last resort, if the above command fails, you can try `python setup.py install`.
#### On ROCm systems
To install from source for AMD GPUs supporting ROCm, please specify the `ROCM_VERSION` environment variable. Example:
```bash
ROCM_VERSION=5.6 pip install -vvv --no-build-isolation -e .
```
The compilation can be sped up by specifying the `PYTORCH_ROCM_ARCH` variable ([reference](https://github.com/pytorch/pytorch/blob/7b73b1e8a73a1777ebe8d2cd4487eb13da55b3ba/setup.py#L132)) in order to build for a single target device, for example `gfx90a` for MI200 series devices.
For ROCm systems, the packages `rocsparse-dev`, `hipsparse-dev`, `rocthrust-dev`, `rocblas-dev` and `hipblas-dev` are required to build.
## Quick Tour
### Quantization and Inference
> Warning: this is just a showcase of the basic AutoGPTQ APIs. It uses only one sample to quantize a very small model, so a model quantized with so few samples may not be of good quality.
Below is an example of the simplest way to use `auto_gptq` to quantize a model and run inference after quantization:
```python
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import logging
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)
pretrained_model_dir = "facebook/opt-125m"
quantized_model_dir = "opt-125m-4bit"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = [
tokenizer(
"auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
)
]
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # setting to False can significantly speed up inference but perplexity may be slightly worse
)
# load un-quantized model, by default, the model will always be loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
# quantize the model; the examples should be a list of dicts whose keys can only be "input_ids" and "attention_mask"
model.quantize(examples)
# save quantized model
model.save_quantized(quantized_model_dir)
# save quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)
# push quantized model to Hugging Face Hub.
# to use use_auth_token=True, log in first via huggingface-cli login,
# or pass an explicit token with: use_auth_token="hf_xxxxxxx"
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
# alternatively you can save and push at the same time
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)
# load quantized model to the first GPU
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")
# download quantized model from Hugging Face Hub and load to the first GPU
# model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)
# inference with model.generate
print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))
# or you can also use pipeline
pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
print(pipeline("auto-gptq is")[0]["generated_text"])
```
For more advanced features of model quantization, please refer to [this script](examples/quantization/quant_with_alpaca.py).
### Customize Model
<details>
<summary>Below is an example of extending `auto_gptq` to support the `OPT` model; as you will see, it's very easy:</summary>
```python
from auto_gptq.modeling import BaseGPTQForCausalLM
class OPTGPTQForCausalLM(BaseGPTQForCausalLM):
# chained attribute name of transformer layer block
layers_block_name = "model.decoder.layers"
# chained attribute names of other nn modules that are at the same level as the transformer layer block
outside_layer_modules = [
"model.decoder.embed_tokens", "model.decoder.embed_positions", "model.decoder.project_out",
"model.decoder.project_in", "model.decoder.final_layer_norm"
]
# chained attribute names of linear layers in transformer layer module
# normally there are four sub-lists; the modules in each can be seen as one operation,
# and the order should match the order in which they are actually executed; in this case (and usually in most cases),
# they are: attention q/k/v projection, attention output projection, MLP input projection, MLP output projection
inside_layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.out_proj"],
["fc1"],
["fc2"]
]
```
After this, you can use `OPTGPTQForCausalLM.from_pretrained` and other methods as shown in the Quick Tour; a usage sketch follows this section.
</details>
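As a rough usage sketch (assuming the `OPTGPTQForCausalLM` class from the example above is in scope, and using `facebook/opt-125m` purely as a stand-in checkpoint), the custom class is driven exactly like the built-in classes from the Quick Tour:

```python
from transformers import AutoTokenizer
from auto_gptq import BaseQuantizeConfig

pretrained_model_dir = "facebook/opt-125m"  # stand-in model for illustration

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = [tokenizer("auto-gptq is an easy-to-use model quantization library.")]

quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)

# the custom class exposes the same interface as the built-in model classes
model = OPTGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
model.quantize(examples)
model.save_quantized("opt-125m-4bit", use_safetensors=True)
```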
### Evaluation on Downstream Tasks
You can use the tasks defined in `auto_gptq.eval_tasks` to evaluate a model's performance on a specific downstream task before and after quantization.
The predefined tasks support all causal language models implemented in [🤗 transformers](https://github.com/huggingface/transformers) and in this project.
<details>
<summary>Below is an example of evaluating `EleutherAI/gpt-j-6b` on a sequence-classification task using the `cardiffnlp/tweet_sentiment_multilingual` dataset:</summary>
```python
from functools import partial
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from auto_gptq.eval_tasks import SequenceClassificationTask
MODEL = "EleutherAI/gpt-j-6b"
DATASET = "cardiffnlp/tweet_sentiment_multilingual"
TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:"
ID2LABEL = {
0: "negative",
1: "neutral",
2: "positive"
}
LABELS = list(ID2LABEL.values())
def ds_refactor_fn(samples):
text_data = samples["text"]
label_data = samples["label"]
new_samples = {"prompt": [], "label": []}
for text, label in zip(text_data, label_data):
prompt = TEMPLATE.format(labels=LABELS, text=text)
new_samples["prompt"].append(prompt)
new_samples["label"].append(ID2LABEL[label])
return new_samples
# model = AutoModelForCausalLM.from_pretrained(MODEL).eval().half().to("cuda:0")
model = AutoGPTQForCausalLM.from_pretrained(MODEL, BaseQuantizeConfig())
tokenizer = AutoTokenizer.from_pretrained(MODEL)
task = SequenceClassificationTask(
model=model,
tokenizer=tokenizer,
classes=LABELS,
data_name_or_path=DATASET,
prompt_col_name="prompt",
label_col_name="label",
**{
"num_samples": 1000, # how many samples will be sampled to evaluation
"sample_max_len": 1024, # max tokens for each sample
"block_max_len": 2048, # max tokens for each data block
# function to load dataset, one must only accept data_name_or_path as input
# and return datasets.Dataset
"load_fn": partial(datasets.load_dataset, name="english"),
# function to preprocess dataset, which is used for datasets.Dataset.map,
# must return Dict[str, list] with only two keys: [prompt_col_name, label_col_name]
"preprocess_fn": ds_refactor_fn,
# truncate label when sample's length exceed sample_max_len
"truncate_prompt": False
}
)
# note that max_new_tokens will be automatically specified internally based on given classes
print(task.run())
# self-consistency
print(
task.run(
generation_config=GenerationConfig(
num_beams=3,
num_return_sequences=3,
do_sample=True
)
)
)
```
</details>
## Learn More
The [tutorials](docs/tutorial) provide step-by-step guidance for integrating `auto_gptq` into your own project, along with some best-practice principles.
The [examples](examples/README.md) provide plenty of example scripts for using `auto_gptq` in different ways.
## Supported Models
> You can use `model.config.model_type` to compare with the table below to check whether the model you are using is supported by `auto_gptq` (a short lookup snippet follows the table).
>
> For example, the `model_type` of `WizardLM`, `vicuna` and `gpt4all` are all `llama`, hence they are all supported by `auto_gptq`.
| model type | quantization | inference | peft-lora | peft-ada-lora | peft-adaption_prompt |
|------------------------------------|--------------|-----------|-----------|---------------|-------------------------------------------------------------------------------------------------|
| bloom | ✅ | ✅ | ✅ | ✅ | |
| gpt2 | ✅ | ✅ | ✅ | ✅ | |
| gpt_neox | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| gptj | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| llama | ✅ | ✅ | ✅ | ✅ | ✅ |
| moss | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| opt | ✅ | ✅ | ✅ | ✅ | |
| gpt_bigcode | ✅ | ✅ | ✅ | ✅ | |
| codegen | ✅ | ✅ | ✅ | ✅ | |
| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | ✅ | ✅ | |
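If you are unsure which row applies to a given checkpoint, the snippet below is a small sketch that reads `model_type` from the checkpoint's config with the standard 🤗 Transformers API (only `config.json` is fetched, not the weights); the model names are placeholders:

```python
from transformers import AutoConfig

for name in ("facebook/opt-125m", "EleutherAI/gpt-j-6b"):
    config = AutoConfig.from_pretrained(name)
    print(name, "->", config.model_type)  # e.g. "opt", "gptj"
```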
## Supported Evaluation Tasks
Currently, `auto_gptq` supports: `LanguageModelingTask`, `SequenceClassificationTask` and `TextSummarizationTask`; more tasks will come soon!
## Running tests
Tests can be run with:
```
pytest tests/ -s
```
## FAQ
### Which kernel is used by default?
AutoGPTQ defaults to the exllamav2 int4*fp16 kernel for matrix multiplication.
### How to use Marlin kernel?
Marlin is an optimized int4*fp16 kernel recently proposed at https://github.com/IST-DASLab/marlin. It is integrated into AutoGPTQ and used when loading a model with `use_marlin=True`. This kernel is available only on devices with compute capability 8.0 or 8.6 (Ampere GPUs).
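A minimal loading sketch (assuming an existing 4-bit GPTQ checkpoint compatible with the Marlin format and an Ampere-class GPU; the model path is a placeholder):

```python
from auto_gptq import AutoGPTQForCausalLM

model = AutoGPTQForCausalLM.from_quantized(
    "opt-125m-4bit",  # placeholder: path or Hub id of a 4-bit GPTQ checkpoint
    device="cuda:0",
    use_marlin=True,  # requires compute capability 8.0/8.6, see above
)
```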
## Acknowledgement
- Special thanks to **Elias Frantar**, **Saleh Ashkboos**, **Torsten Hoefler** and **Dan Alistarh** for proposing the **GPTQ** algorithm, open-sourcing the [code](https://github.com/IST-DASLab/gptq), and releasing the [Marlin kernel](https://github.com/IST-DASLab/marlin) for mixed-precision computation.
- Special thanks to **qwopqwop200**: the quantization-related code in this project is mainly adapted from [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda).
- Special thanks to **turboderp** for releasing the [Exllama](https://github.com/turboderp/exllama) and [Exllama v2](https://github.com/turboderp/exllamav2) libraries with efficient mixed-precision kernels.
<h1 align="center">Yuan2-M32基于AutoGPTQ的量化和推理</h1>
## 配置AutoGPTQ环境
- AutoGPTQ环境配置要求:CUDA版本高于11.8
- 容器:使用[vllm](https://github.com/IEI-mjx/Yuan2.0-M32/blob/main/vllm/README_Yuan_vllm.md)项目提供的镜像创建容器
```shell
# enter the container
docker exec -it vllm_yuan bash
# go to your working directory
cd /mnt
# clone this project
git clone https://github.com/IEIT-Yuan/Yuan2.0-M32.git
# enter the AutoGPTQ project
cd Yuan2.0-M32/3rd_party/AutoGPTQ
# install auto-gptq
pip install auto-gptq --no-build-isolation
```
## Quantizing the Yuan2-M32-HF Model
Quantizing the Yuan2-M32 model involves three main steps: 1. download the Yuan2-M32-HF model; 2. download the dataset; 3. set the quantization parameters and quantize the Yuan2-M32-HF model.
- 1. Download the Yuan2-M32 Hugging Face model and move it to the target path (/mnt/beegfs2/Yuan2-M32-HF); see [vllm](https://github.com/IEI-mjx/Yuan2.0-M32/blob/main/vllm/README_Yuan_vllm.md) for reference. Model download address: https://huggingface.co/IEIT-Yuan/Yuan2-M32-hf
- 2. Download the dataset from [here](https://huggingface.co/datasets/hakurei/open-instruct-v1) and move it to a target path such as /mnt/beegfs2/
- 3. Adjust the quantization parameters and run the quantization as follows
```shell
# edit Yuan2-M32-int4.py
cd /mnt/beegfs2/Yuan2.0-M32/3rd_party/AutoGPTQ
vim Yuan2-M32-int4.py
'''
pretrained_model_dir = "/mnt/beegfs2/Yuan2-M32-HF"
quantized_model_dir = "/mnt/beegfs2/Yuan2-M32-GPTQ-int4"
tokenizer = LlamaTokenizer.from_pretrained("/mnt/beegfs2/Yuan2-M32-HF", add_eos_token=False, add_bos_token=False, eos_token='<eod>', use_fast=True)
tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>','<commit_before>','<commit_msg>','<commit_after>','<jupyter_start>','<jupyter_text>','<jupyter_code>','<jupyter_output>','<empty_output>'], special_tokens=True)
examples = []
with open("/mnt/beegfs2/instruct_data.json", 'r', encoding='utf-8') as file: # 数据集路径
data = json.load(file)
for i, item in enumerate(data):
if i >= 2000:
break
instruction = item.get('instruction', '')
output = item.get('output', '')
combined_text = instruction + " " + output
examples.append(tokenizer(combined_text))
max_memory = {0: "80GIB", 1: "80GIB", 2: "80GIB", 3: "80GIB", 4: "80GIB", 5: "80GIB", 6: "80GIB", 7: "80GIB"}
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # setting to False can significantly speed up inference but perplexity may be slightly worse
)
'''
# 1. Set pretrained_model_dir and specify quantized_model_dir for the quantized output
# 2. Set the dataset path
# 3. max_memory specifies which GPUs to use
# 4. Adjust the quantization parameters: bits=4 for INT4 precision, bits=8 for INT8; the other parameters can keep the defaults (an INT8 config sketch follows this block)
# run the script
python Yuan2-M32-int4.py
# quantizing and packing the model takes about 8 hours; INT4 and INT8 versions can be quantized at the same time on different GPUs
```
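For reference, an INT8 variant of the quantization config used in `Yuan2-M32-int4.py` only changes the `bits` argument; this is a sketch, with the other settings kept at the values shown above:

```python
from auto_gptq import BaseQuantizeConfig

quantize_config_int8 = BaseQuantizeConfig(
    bits=8,          # INT8 precision; use bits=4 for INT4
    group_size=128,  # 128 is the recommended value
    desc_act=False,  # False trades slightly worse perplexity for much faster inference
)
```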
## Inference with the GPTQ-Quantized Model
After quantization finishes, the target folder contains a checkpoint file with the .safetensors suffix along with config.json and quantize_config.json; you first need to copy the tokenizer-related files over from the Yuan2-M32-HF directory.
```shell
# go to the Yuan2-M32-HF directory
cd /mnt/beegfs2/Yuan2-M32-HF
# copy the tokenizer-related files to Yuan2-M32-GPTQ-int4
cp special_tokens_map.json tokenizer* /mnt/beegfs2/Yuan2-M32-GPTQ-int4
# edit inference.py
cd /mnt/beegfs2/Yuan2.0-M32/3rd_party/AutoGPTQ
vim inference.py
'''
quantized_model_dir = "/mnt/beegfs2/Yuan2-M32-GPTQ-int4"
tokenizer = LlamaTokenizer.from_pretrained('/mnt/beegfs2/Yuan2-M32-GPTQ-int4', add_eos_token=False, add_bos_token=False, eos_token='<eod>')
tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>','<commit_before>','<commit_msg>','<commit_after>','<jupyter_start>','<jupyter_text>','<jupyter_code>','<jupyter_output>','<empty_output>'], special_tokens=True)
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", trust_remote_code=True)
'''
# set quantized_model_dir and the tokenizer path
# run inference.py (a minimal sketch follows this block)
python inference.py
```
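For reference, a minimal `inference.py` along the lines of the excerpt above could look like the following sketch (the prompt string and generation length are placeholders; the special-token list matches the quantization script):

```python
from transformers import LlamaTokenizer
from auto_gptq import AutoGPTQForCausalLM

quantized_model_dir = "/mnt/beegfs2/Yuan2-M32-GPTQ-int4"

tokenizer = LlamaTokenizer.from_pretrained(
    quantized_model_dir, add_eos_token=False, add_bos_token=False, eos_token='<eod>'
)
tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>',
                      '<FIM_MIDDLE>', '<commit_before>', '<commit_msg>', '<commit_after>',
                      '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>',
                      '<empty_output>'], special_tokens=True)

# load the quantized checkpoint onto the first GPU
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", trust_remote_code=True)

prompt = "Write a function that sorts a list of integers."  # placeholder prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=512)
print(tokenizer.decode(outputs[0]))
```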
## Inference Accuracy & Performance Tests
> The HumanEval test parameters are as follows:
> generation_params = {
>     "max_new_tokens": 512,
>     "top_k": 1,
>     "top_p": 0,
>     "temperature": 1.0,
> }
> BF16 model inference uses two 80GB GPUs; GPTQ-INT4/INT8 model inference uses one 80GB GPU.
> Test results:
| Model               | Precision | HumanEval | Inference Speed | Inference Memory Usage |
|---------------------|-----------|-----------|-----------------|------------------------|
| Yuan2-M32-HF        | BF16      | 73.17%    | 13.16 token/s   | 76.34 GB               |
| Yuan2-M32-GPTQ-int8 | INT8      | 72.56%    | 9.05 token/s    | 39.81 GB               |
| Yuan2-M32-GPTQ-int4 | INT4      | 66.46%    | 9.24 token/s    | 23.27 GB               |
<h1 align="center">AutoGPTQ</h1>
<p align="center">一个基于 GPTQ 算法,简单易用且拥有用户友好型接口的大语言模型量化工具包。</p>
<p align="center">
<a href="https://github.com/PanQiWei/AutoGPTQ/releases">
<img alt="GitHub release" src="https://img.shields.io/github/release/PanQiWei/AutoGPTQ.svg">
</a>
<a href="https://pypi.org/project/auto-gptq/">
<img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dd/auto-gptq">
</a>
</p>
<h4 align="center">
<p>
<a href="https://github.com/PanQiWei/AutoGPTQ/blob/main/README.md">English</a> |
<b>中文</b>
</p>
</h4>
Note: The English README is likely to be more up to date.
## The Road to v1.0.0
Hi, fellow community members, long time no see! I'm sorry that I haven't been able to update this project frequently lately due to personal reasons. The past few weeks have been significant for my career plans. Not long ago I formally said goodbye to the startup team I had joined for two years after graduation. I'm very grateful to the leaders and colleagues there for their trust and guidance, which allowed me to grow rapidly over those two years, and I'm also deeply thankful that the team let me use their internal A100 GPU server cluster free of charge since the founding of the AutoGPTQ project to run experiments and performance evaluations. (Of course, I can no longer use it going forward, so **I would be very grateful for any new hardware sponsorship!**) Over the past two years I worked there as an algorithm engineer, responsible for the architecture design and development of dialogue systems based on large language models. We successfully launched a product called gemsouls, but unfortunately it has ceased operation. Now that team is about to launch a new product called [modelize](https://www.beta.modelize.ai/), **an LLM-native AI agent platform where users can build a highly automated team of multiple AI agents that cooperate in workflows to complete complex projects efficiently.**
Back to the topic: I'm very excited to see that over the past few months, research on optimizing the inference performance of large language models has made tremendous progress. We can now not only run LLM inference on high-end GPUs, but also easily run LLMs on CPUs and edge devices. This series of technical advances makes me eager to contribute more to the open-source community, so, first, I will spend about four weeks iterating AutoGPTQ to the official v1.0.0 release; during this period there will also be two or three minor releases so that users can try out the performance optimizations and new features early. In my vision, **by the time v1.0.0 is officially released, AutoGPTQ will serve as a flexible and extensible quantization backend that supports all GPTQ-like methods and automatically quantizes LLMs written in PyTorch.** I described the development plan in detail [here](https://github.com/PanQiWei/AutoGPTQ/issues/348); feel free to join the discussion and share your suggestions!
## News or Update
- 2023-08-23 - (News) - 🤗 Transformers, optimum and peft have integrated `auto-gptq`, making it easier to run inference with and train GPTQ models! Read [this blog post](https://huggingface.co/blog/gptq-integration) and the related resources for more details!
- 2023-08-21 - (News) - The Tongyi Qianwen team released a 4-bit quantized version of Qwen-7B based on `auto-gptq`, together with [detailed benchmark results](https://huggingface.co/Qwen/Qwen-7B-Chat-Int4#%E9%87%8F%E5%8C%96-quantization).
- 2023-08-06 - (Update) - Support exllama's q4 CUDA kernel, giving int4 quantized models at least a 1.3x inference speedup.
- 2023-08-04 - (Update) - Support ROCm so that users with AMD GPUs can use auto-gptq's CUDA extension.
- 2023-07-26 - (Update) - An elegant [PPL benchmark script](examples/benchmark/perplexity.py) to obtain results that can be fairly compared with other libraries such as `llama.cpp`.
- 2023-06-05 - (Update) - Integrate 🤗 peft to train adapters on top of GPTQ-quantized models, supporting LoRA, AdaLoRA, AdaptionPrompt, etc.
- 2023-05-30 - (Update) - Support downloading quantized models from the 🤗 Hub and uploading quantized models to the 🤗 Hub.
*For more history, please turn to [here](docs/NEWS_OR_UPDATE.md)*
## Performance Comparison
### Inference Speed
> The results below were generated with [this script](examples/benchmark/generation_speed.py): the input batch size is 1, the decoding strategy is beam search, the model is forced to generate 512 tokens, and the speed metric is tokens/s (higher is better).
>
> The quantized model is loaded with the settings that give the fastest inference speed.
| model | GPU | num_beams | fp16 | gptq-int4 |
|---------------|---------------|-----------|-------|-----------|
| llama-7b | 1xA100-40G | 1 | 18.87 | 25.53 |
| llama-7b | 1xA100-40G | 4 | 68.79 | 91.30 |
| moss-moon 16b | 1xA100-40G | 1 | 12.48 | 15.25 |
| moss-moon 16b | 1xA100-40G | 4 | OOM | 42.67 |
| moss-moon 16b | 2xA100-40G | 1 | 06.83 | 06.78 |
| moss-moon 16b | 2xA100-40G | 4 | 13.10 | 10.80 |
| gpt-j 6b | 1xRTX3060-12G | 1 | OOM | 29.55 |
| gpt-j 6b | 1xRTX3060-12G | 4 | OOM | 47.36 |
### Perplexity (PPL)
For a perplexity comparison, you can refer to [here](https://github.com/qwopqwop200/GPTQ-for-LLaMa#result) and [here](https://github.com/qwopqwop200/GPTQ-for-LLaMa#gptq-vs-bitsandbytes).
## Installation
### Quick Installation
You can install the latest stable release of AutoGPTQ, with pre-built wheels compatible with PyTorch 2.0.1, via pip:
* For CUDA 11.7: `pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/`
* For CUDA 11.8: `pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/`
* For ROCm 5.4.2: `pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/rocm542/`
**Warning:** The pre-built wheels are not guaranteed to work with PyTorch nightly builds. If you want to use a PyTorch nightly build, please install AutoGPTQ from source.
#### Disabling the CUDA extension
By default, the CUDA extension is installed automatically when `torch` and `cuda` are already present on your machine. If you do not want the extension, install with the following command:
```shell
BUILD_CUDA_EXT=0 pip install auto-gptq
```
To also make sure the extension `autogptq_cuda` no longer exists in your virtual environment, run:
```shell
pip uninstall autogptq_cuda -y
```
#### Triton acceleration
To accelerate model inference with `triton`, use the following command:
> Warning: triton currently only supports Linux; 3-bit quantization is not supported when using triton.
```shell
pip install auto-gptq[triton]
```
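After installation, the Triton backend is enabled by passing `use_triton=True` when loading a quantized model. This is a sketch; the model path is a placeholder, and 3-bit checkpoints are not supported:

```python
from auto_gptq import AutoGPTQForCausalLM

model = AutoGPTQForCausalLM.from_quantized(
    "opt-125m-4bit",  # placeholder: path or Hub id of a quantized model
    device="cuda:0",
    use_triton=True,  # Linux only
)
```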
### Install from source
<details>
<summary>Click to see details</summary>
Clone the source code:
```shell
git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ
```
Then, install from the project directory:
```shell
pip install .
```
As in the Quick Installation section, you can use `BUILD_CUDA_EXT=0` to skip building the CUDA extension.
If you want triton acceleration and your operating system supports it, use `.[triton]`.
For AMD GPUs, to install from source with ROCm support, set the `ROCM_VERSION` environment variable. Compilation can also be sped up by setting the `PYTORCH_ROCM_ARCH` variable ([reference](https://github.com/pytorch/pytorch/blob/7b73b1e8a73a1777ebe8d2cd4487eb13da55b3ba/setup.py#L132)) to build for a single target device, for example `gfx90a` for MI200 series devices. Example:
```
ROCM_VERSION=5.6 pip install .
```
For ROCm systems, the following packages must additionally be installed before building from source: `rocsparse-dev`, `hipsparse-dev`, `rocthrust-dev`, `rocblas-dev` and `hipblas-dev`.
</details>
## Quick Start
### Quantization and Inference
> Warning: this only showcases the basic AutoGPTQ APIs and uses a single piece of text to quantize a very small model, so the result may not perform as well as expected when quantizing larger models.
Below is the simplest way to use `auto_gptq` to quantize a model and run inference after quantization:
```python
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
pretrained_model_dir = "facebook/opt-125m"
quantized_model_dir = "opt-125m-4bit"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = [
tokenizer(
"auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
)
]
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # setting to False can significantly speed up inference but perplexity may be slightly worse
)
# load the un-quantized model; by default, the model will always be loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
# quantize the model; the examples should be a list of dicts whose keys can only be "input_ids" and "attention_mask"
model.quantize(examples)
# save the quantized model
model.save_quantized(quantized_model_dir)
# save the quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)
# push the quantized model to the Hugging Face Hub.
# to use use_auth_token=True, log in first via huggingface-cli login,
# or pass an explicit token with: use_auth_token="hf_xxxxxxx"
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
# alternatively you can save and push at the same time
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)
# load the quantized model onto the first GPU
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")
# download a quantized model from the Hugging Face Hub and load it onto the first GPU
# model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)
# inference with model.generate
print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))
# or you can also use the pipeline
pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
print(pipeline("auto-gptq is")[0]["generated_text"])
```
For more advanced usage, please refer to [this example script](examples/quantization/quant_with_alpaca.py).
### Customize Model
<details>
<summary>Below is an example of extending `auto_gptq` to support the `OPT` model; as you will see, it's very easy:</summary>
```python
from auto_gptq.modeling import BaseGPTQForCausalLM
class OPTGPTQForCausalLM(BaseGPTQForCausalLM):
# chained attribute name of transformer layer block
layers_block_name = "model.decoder.layers"
# chained attribute names of other nn modules that are at the same level as the transformer layer block
outside_layer_modules = [
"model.decoder.embed_tokens", "model.decoder.embed_positions", "model.decoder.project_out",
"model.decoder.project_in", "model.decoder.final_layer_norm"
]
# chained attribute names of linear layers in transformer layer module
# normally there are four sub-lists; the modules in each can be seen as one operation,
# and the order should match the order in which they are actually executed; in this case (and usually in most cases),
# they are: attention q/k/v projection, attention output projection, MLP input projection, MLP output projection
inside_layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.out_proj"],
["fc1"],
["fc2"]
]
```
Then you can use `OPTGPTQForCausalLM.from_pretrained` and other methods as shown in the Quick Start section.
</details>
### Evaluation on Downstream Tasks
You can use the tasks defined in `auto_gptq.eval_tasks` to evaluate a model's performance on a specific downstream task before and after quantization.
The predefined tasks support all causal language models implemented in [🤗 transformers](https://github.com/huggingface/transformers) and in this project.
<details>
<summary>Below is an example of evaluating `EleutherAI/gpt-j-6b` on a sequence-classification (text-classification) task using the `cardiffnlp/tweet_sentiment_multilingual` dataset:</summary>
```python
from functools import partial
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from auto_gptq.eval_tasks import SequenceClassificationTask
MODEL = "EleutherAI/gpt-j-6b"
DATASET = "cardiffnlp/tweet_sentiment_multilingual"
TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:"
ID2LABEL = {
0: "negative",
1: "neutral",
2: "positive"
}
LABELS = list(ID2LABEL.values())
def ds_refactor_fn(samples):
text_data = samples["text"]
label_data = samples["label"]
new_samples = {"prompt": [], "label": []}
for text, label in zip(text_data, label_data):
prompt = TEMPLATE.format(labels=LABELS, text=text)
new_samples["prompt"].append(prompt)
new_samples["label"].append(ID2LABEL[label])
return new_samples
# model = AutoModelForCausalLM.from_pretrained(MODEL).eval().half().to("cuda:0")
model = AutoGPTQForCausalLM.from_pretrained(MODEL, BaseQuantizeConfig())
tokenizer = AutoTokenizer.from_pretrained(MODEL)
task = SequenceClassificationTask(
model=model,
tokenizer=tokenizer,
classes=LABELS,
data_name_or_path=DATASET,
prompt_col_name="prompt",
label_col_name="label",
**{
"num_samples": 1000, # how many samples will be sampled to evaluation
"sample_max_len": 1024, # max tokens for each sample
"block_max_len": 2048, # max tokens for each data block
# function to load dataset, one must only accept data_name_or_path as input
# and return datasets.Dataset
"load_fn": partial(datasets.load_dataset, name="english"),
# function to preprocess dataset, which is used for datasets.Dataset.map,
# must return Dict[str, list] with only two keys: [prompt_col_name, label_col_name]
"preprocess_fn": ds_refactor_fn,
# truncate label when sample's length exceed sample_max_len
"truncate_prompt": False
}
)
# note that max_new_tokens will be automatically specified internally based on given classes
print(task.run())
# self-consistency
print(
task.run(
generation_config=GenerationConfig(
num_beams=3,
num_return_sequences=3,
do_sample=True
)
)
)
```
</details>
## Learn More
The [tutorials](docs/tutorial) provide step-by-step guidance for integrating `auto_gptq` into your own project, along with some best-practice principles.
The [examples](examples/README.md) provide plenty of example scripts for using `auto_gptq` in different ways.
## Supported Models
> You can use `model.config.model_type` to compare with the table below to check whether the model you are using is supported by `auto_gptq`.
>
> For example, the `model_type` of `WizardLM`, `vicuna` and `gpt4all` are all `llama`, so they are all supported by `auto_gptq`.
| model type | quantization | inference | peft-lora | peft-ada-lora | peft-adaption_prompt |
|------------------------------------|--------------|-----------|-----------|---------------|-----------------------------------------------------------------------------------|
| bloom | ✅ | ✅ | ✅ | ✅ | |
| gpt2 | ✅ | ✅ | ✅ | ✅ | |
| gpt_neox                           | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| gptj                               | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| llama                              | ✅            | ✅         | ✅         | ✅             | ✅                                                                                   |
| moss                               | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| opt | ✅ | ✅ | ✅ | ✅ | |
| gpt_bigcode | ✅ | ✅ | ✅ | ✅ | |
| codegen | ✅ | ✅ | ✅ | ✅ | |
| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | ✅ | ✅ | |
## Supported Evaluation Tasks
Currently, `auto_gptq` supports the following evaluation tasks: `LanguageModelingTask`, `SequenceClassificationTask` and `TextSummarizationTask`; more tasks will come soon!
## Acknowledgement
- Special thanks to **Elias Frantar**, **Saleh Ashkboos**, **Torsten Hoefler** and **Dan Alistarh** for proposing the **GPTQ** algorithm and open-sourcing the [code](https://github.com/IST-DASLab/gptq).
- Special thanks to **qwopqwop200**: the quantization-related code in this project is mainly adapted from [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda).
[![Star History Chart](https://api.star-history.com/svg?repos=PanQiwei/AutoGPTQ&type=Date)](https://star-history.com/#PanQiWei/AutoGPTQ&Date)
from transformers import LlamaTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import logging
import json
logging.basicConfig(
format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)
pretrained_model_dir = "/mnt/beegfs2/Yuan2-M32-HF"
quantized_model_dir = "/mnt/beegfs2/Yuan2-M32-GPTQ-int4"
tokenizer = LlamaTokenizer.from_pretrained("/mnt/beegfs2/Yuan2-M32-HF", add_eos_token=False, add_bos_token=False, eos_token='<eod>', use_fast=True)
tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>','<commit_before>','<commit_msg>','<commit_after>','<jupyter_start>','<jupyter_text>','<jupyter_code>','<jupyter_output>','<empty_output>'], special_tokens=True)
examples = []
with open("/mnt/beegfs2/instruct_data.json", 'r', encoding='utf-8') as file:
data = json.load(file)
for i, item in enumerate(data):
if i >= 2000:
break
instruction = item.get('instruction', '')
output = item.get('output', '')
combined_text = instruction + " " + output
examples.append(tokenizer(combined_text))
max_memory = {0: "80GIB", 1: "80GIB", 2: "80GIB", 3: "80GIB", 4: "80GIB", 5: "80GIB", 6: "80GIB", 7: "80GIB"}
quantize_config = BaseQuantizeConfig(
bits=4, # quantize model to 4-bit
group_size=128, # it is recommended to set the value to 128
desc_act=False, # setting to False can significantly speed up inference but perplexity may be slightly worse
)
# load un-quantized model, by default, the model will always be loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config, trust_remote_code=True, max_memory = max_memory)
# quantize the model; the examples should be a list of dicts whose keys can only be "input_ids" and "attention_mask"
model.quantize(examples)
# save quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)
# push quantized model to Hugging Face Hub.
# to use use_auth_token=True, log in first via huggingface-cli login,
# or pass an explicit token with: use_auth_token="hf_xxxxxxx"
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
# alternatively you can save and push at the same time
# (uncomment the following three lines to enable this feature)
# repo_id = f"YourUserName/{quantized_model_dir}"
# commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)