Unverified commit 73d3e7b6 authored by Rickard, committed by GitHub

Make native code portable and add GitHub workflow for building (#949)



* Make native code portable and add GitHub workflow for building

* Removed deprecated Python versions

* Update python-package.yml
Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update python-package.yml
Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update python-package.yml
Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update python-package.yml
Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update python-package.yml
Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update python-package.yml
Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update python-package.yml
Co-authored-by: Aarni Koskela <akx@iki.fi>

* Update python-package.yml

* Do not test on Python 3.13 until released

* Update python-package.yml

* Update python-package.yml

* Update python-package.yml

* Update python-package.yml

* Refactor build stage

* Fixed breaking actions change

* Slim down Windows cuda

* Create dependabot.yml

* Bespoke local dev requirements.txt

* Enable VS integration

* Group Dependabot updates

* Cleanup

* Update python-package.yml

* Reinstate file that was wrongly merged

* Fixed regression caused by new version of download-artifact

* Update python-package.yml

* Update python-package.yml

* Fix matrix

* Update python-package.yml

* Merge

* Pipeline

* Fixed conflict

* Fixed conflict

* Update CMakeLists.txt

* Fixed merge error

* cleanup

* cleanup

* Find CUDA

* Fix

* Fixing merge error from latest merge from main

* Fix setup.py

* Fixed typo in artifact name

* Remove linker flags

* Build nocublaslt versions

* Fixed formatting

* Fixed VS Code format on save

* Ran format on save from VScode

* Re-saved the json files using the new settings

* Re-saved CMakeLists.txt to get formatting right

* Add path filter

* Formatting

---------
Co-authored-by: Aarni Koskela <akx@iki.fi>
parent 332530ba
version: 2
updates:
- package-ecosystem: pip
directory: "/"
schedule:
interval: "weekly"
groups:
major:
update-types: [major]
minor-patch:
update-types: [minor, patch]
name: CMake on multiple platforms
on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
concurrency:
group: cmake-${{ github.ref }}
cancel-in-progress: true
jobs:
build-shared-libs:
runs-on: ${{ matrix.os }}
strategy:
# Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest]
arch: [x86_64, aarch64]
build_type: [Release]
exclude:
- os: windows-latest
arch: aarch64
steps:
- uses: actions/checkout@v4
- name: Set up MSVC
if: matrix.os == 'windows-latest'
uses: ilammy/msvc-dev-cmd@v1.13.0
with:
arch: amd64
- name: Set reusable strings
# Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
id: strings
shell: bash
run: |
echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
- name: Prep build
run: python3 -m pip install cmake==3.27.9 ninja setuptools wheel
- name: Prep Compilers
shell: bash -el {0}
run: |
if [ "${{ matrix.os }}" = "windows-latest" ]; then
echo CXX_COMPILER=cl >> "$GITHUB_ENV"
echo C_COMPILER=cl >> "$GITHUB_ENV"
else
echo CXX_COMPILER=g++ >> "$GITHUB_ENV"
echo C_COMPILER=gcc >> "$GITHUB_ENV"
fi
- name: Configure CPU
run: >
cmake -B ${{ steps.strings.outputs.build-output-dir }}
-G Ninja
-DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-DBUILD_CUDA=OFF
-S ${{ github.workspace }}
- name: Build CPU
run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
- name: Copy libraries
shell: bash
run: |
mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
( shopt -s nullglob && cp -a bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }} )
- name: Upload Build Artifacts
uses: actions/upload-artifact@v4
with:
name: shared_library-${{ matrix.os }}-${{ matrix.arch }}
path: output/*
build-shared-libs-cuda:
runs-on: ${{ matrix.os }}
strategy:
# Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest]
cuda-version: ['11.8', '12.1']
arch: [x86_64, aarch64]
build_type: [Release]
exclude:
- os: windows-latest
arch: aarch64
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Set up MSVC
if: matrix.os == 'windows-latest'
uses: ilammy/msvc-dev-cmd@v1.13.0
with:
arch: amd64
- name: Setup Mambaforge
uses: conda-incubator/setup-miniconda@v3.0.1
with:
miniforge-variant: Mambaforge
miniforge-version: latest
activate-environment: bnb-env
use-mamba: true
- uses: conda-incubator/setup-miniconda@v3.0.1
with:
auto-update-conda: true
activate-environment: bnb-env
environment-file: environment-bnb.yml
use-only-tar-bz2: false
auto-activate-base: true
python-version: "3.10"
mamba-version: "*"
- name: Set reusable strings
# Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
id: strings
shell: bash
run: |
echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
- name: CUDA Toolkit
shell: bash -el {0}
run: |
if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then
# to prepare space
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
fi
addon=""
cuda_version=${{ matrix.cuda-version }}
[ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ] && addon="cuda-cudart-static cuda-nvrtc"
[ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "windows-latest" ] && addon="cuda-nvrtc"
[ "$cuda_version" = "11.8" ] && cuda_version="11.8.0"
[ "$cuda_version" = "12.1" ] && cuda_version="12.1.1"
conda install pytorch-cuda=${{ matrix.cuda-version }} -c pytorch # its dependencies are sometimes not resolved correctly
conda install cuda-python=${{ matrix.cuda-version }} cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version"
[ "${{ matrix.os }}" = "windows-latest" ] && conda install "clang>=17.0.6" "clangxx>=17.0.6" -c conda-forge
CUDA_HOME="${{ env.CONDA }}/envs/bnb-env"
echo CUDA_HOME=$CUDA_HOME >> "$GITHUB_ENV"
echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV"
if [ "${{ matrix.os }}" = "windows-latest" ]; then
echo CXX_COMPILER=cl >> "$GITHUB_ENV"
echo C_COMPILER=cl >> "$GITHUB_ENV"
# without -DCMAKE_CUDA_COMPILER=nvcc, cmake config always fails for cuda-11.8
echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV"
else
echo CXX_COMPILER=g++ >> "$GITHUB_ENV"
echo C_COMPILER=gcc >> "$GITHUB_ENV"
fi
nvcc --version
- name: Update environment
run: mamba env update -n bnb-env -f environment-bnb.yml
- name: Prep build
run: python -m pip install cmake==3.27.9 ninja setuptools wheel
# TODO: the following steps (CUDA, NOBLASLT, CPU) could be moved to the matrix, so they're built in parallel
- name: Configure CUDA
run: >
cmake -B ${{ steps.strings.outputs.build-output-dir }}
-G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
-S ${{ github.workspace }}
- name: Build CUDA
run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
- name: Configure NOBLASLT
run: >
cmake -B ${{ steps.strings.outputs.build-output-dir }}
-G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
-DCMAKE_C_COMPILER=${{ env.C_COMPILER }}
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
-DNO_CUBLASLT=ON
-S ${{ github.workspace }}
- name: Build NOBLASLT
run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}
- name: Copy libraries
shell: bash
run: |
mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
( shopt -s nullglob && cp -a bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }} )
- name: Upload Build Artifacts
uses: actions/upload-artifact@v4
with:
name: shared_library_cuda-${{ matrix.os }}-${{ matrix.cuda-version }}-${{ matrix.arch }}
path: output/*
build-wheels:
needs:
- build-shared-libs
- build-shared-libs-cuda
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, windows-latest]
arch: [x86_64, aarch64]
exclude:
- os: windows-latest
arch: aarch64
steps:
# Check out code
- uses: actions/checkout@v4
# Download shared libraries
- name: Download build artifact
uses: actions/download-artifact@v4
with:
merge-multiple: true
path: output/
- name: Copy correct platform shared libraries
shell: bash
run: |
cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/
# Set up the Python version needed
- name: Set up Python 3.10
uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: pip
- name: Install build package
shell: bash
run: pip install build
- name: Build wheel
shell: bash
run: python -m build . --wheel
- name: Upload Build Artifacts
uses: actions/upload-artifact@v4
with:
name: bdist_wheel-${{ matrix.os }}-${{ matrix.arch }}
path: |
${{ github.workspace }}/dist/
name: Python package
on:
push: {}
pull_request:
branches: [ main ]
paths:
- '.github/workflows/python-package.yml'
- 'bitsandbytes/**'
- 'csrc/**'
- 'include/**'
- 'tests/**'
- 'CMakeLists.txt'
- 'requirements*.txt'
- 'setup.py'
- 'pyproject.toml'
- 'pytest.ini'
- '**/*.md'
release:
types: [ published ]
jobs:
##
# This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
##
build-shared-libs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
arch: [x86_64, aarch64]
exclude:
- os: windows-latest # This probably requires arm64 Windows agents
arch: aarch64
runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
steps:
# Check out code
- uses: actions/checkout@v4
# On Linux we use CMake within Docker
- name: Setup cmake
uses: jwlawson/actions-setup-cmake@v1.14
with:
cmake-version: '3.26.x'
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v1.1
if: ${{ startsWith(matrix.os, 'windows') }}
# Check out dependencies code
- uses: actions/checkout@v4
name: Check out NVidia cub
with:
repository: nvidia/cub
ref: 1.11.0
path: dependencies/cub
# Compile C++ code
- name: Build C++
shell: bash
run: |
set -ex
build_os=${{ matrix.os }}
build_arch=${{ matrix.arch }}
if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then
# Allow cross-compile on aarch64
sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu
fi
if [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then
cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu .
else
cmake -DCOMPUTE_BACKEND=cpu .
fi
if [ ${build_os:0:7} == windows ]; then
pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
else
make
fi
mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
- name: Upload build artifact
uses: actions/upload-artifact@v4
with:
name: shared_library_${{ matrix.os }}_${{ matrix.arch }}
path: output/*
retention-days: 7
##
# This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
##
build-shared-libs-cuda:
strategy:
matrix:
os: [ubuntu-latest, windows-latest]
arch: [x86_64, aarch64]
cuda_version: ['12.1.0']
exclude:
- os: windows-latest # This probably requires arm64 Windows agents
arch: aarch64
runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
steps:
# Check out code
- uses: actions/checkout@v4
# Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation)
- name: Set up Docker multiarch
if: startsWith(matrix.os, 'ubuntu')
uses: docker/setup-qemu-action@v2
# On Linux we use CMake within Docker
- name: Setup cmake
if: ${{ !startsWith(matrix.os, 'linux') }}
uses: jwlawson/actions-setup-cmake@v1.14
with:
cmake-version: '3.26.x'
# Windows: We install Cuda on the agent (slow)
- uses: Jimver/cuda-toolkit@v0.2.14
if: startsWith(matrix.os, 'windows')
id: cuda-toolkit
with:
cuda: ${{ matrix.cuda_version }}
method: 'local'
# sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]'
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v1.1
if: ${{ startsWith(matrix.os, 'windows') }}
# Check out dependencies code
- uses: actions/checkout@v4
name: Check out NVidia cub
with:
repository: nvidia/cub
ref: 1.11.0
path: dependencies/cub
# Compile C++ code
- name: Build C++
shell: bash
run: |
set -ex
build_os=${{ matrix.os }}
build_arch=${{ matrix.arch }}
for NO_CUBLASLT in ON OFF; do
if [ ${build_os:0:6} == ubuntu ]; then
image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04
echo "Using image $image"
docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
"apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
&& cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \
&& make"
else
cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} .
pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
fi
done
mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
- name: Upload build artifact
uses: actions/upload-artifact@v4
with:
name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }}
path: output/*
retention-days: 7
build-wheels:
needs:
- build-shared-libs
- build-shared-libs-cuda
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ["3.9", "3.10", "3.11", "3.12"]
arch: [x86_64, aarch64]
exclude:
- os: windows-latest # This probably requires arm64 Windows agents
arch: aarch64
runs-on: ${{ matrix.os }}
steps:
# Check out code
- uses: actions/checkout@v4
# Download shared libraries
- name: Download build artifact
uses: actions/download-artifact@v4
with:
merge-multiple: true
pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*"
path: output/
- name: Copy correct platform shared library
shell: bash
run: |
ls -lR output/
cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/
# Set up the Python version needed
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: pip
- name: Install build package
shell: bash
run: pip install build
- name: Install Python test dependencies
shell: bash
run: pip install -r requirements-ci.txt
# TODO: How to run CUDA tests on GitHub actions?
#- name: Run unit tests
# if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents
# run: |
# PYTHONPATH=. pytest --log-cli-level=DEBUG tests
- name: Build wheel
shell: bash
run: python -m build .
- name: Upload build artifact
uses: actions/upload-artifact@v4
with:
name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.python-version }}
path: dist/bitsandbytes-*.whl
retention-days: 7
publish:
needs: build-wheels
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Download build artifact
uses: actions/download-artifact@v4
with:
path: dist/
merge-multiple: true
pattern: "bdist_wheel_*"
- run: |
ls -lR dist/
- name: Publish to PyPi
if: startsWith(github.ref, 'refs/tags')
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.pypi }}
@@ -2,8 +2,8 @@
 # Ensure the CUDA Toolkit is available on your path. Then run:
 #   For GCC: `cmake -B build . && cmake --build build`
 #   For MSVC: `cmake -B build . && cmake --build build --config Release`
-# You can also use the following options
-#  - BUILD_CUDA: Default ON, will build with CUDA
+# You can also use the following options and variables
+#  - COMPUTE_BACKEND: Set to `cpu`, `cuda`, or `mps` to select the backend
 #  - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support
 #  - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version
 #    is whatever CMake finds on your path.
@@ -11,25 +11,53 @@
 #    Separate by semicolons, i.e. `-DCOMPUTE_CAPABILITY=89;90`
 #    Check your compute capability here: https://developer.nvidia.com/cuda-gpus
 #  - PTXAS_VERBOSE: Pass the `-v` option to the PTX Assembler
-cmake_minimum_required(VERSION 3.18)
+cmake_minimum_required(VERSION 3.22.1)

-project(bitsandbytes LANGUAGES C CXX)
+project(bitsandbytes LANGUAGES CXX)

-option(BUILD_CUDA "Build bitsandbytes with CUDA support" ON)
-option(NO_CUBLASLT "Disable CUBLAS" OFF)
-option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
-
-set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.c)
-list(APPEND CUDA_FILES csrc/ops.cu csrc/kernels.cu)
+# Define included source files
+set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp)
+set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
+set(MPS_FILES csrc/mps_ops.mm)
+set(METAL_FILES csrc/mps_kernels.metal)
+# C++ sources are always included
 list(APPEND SRC_FILES ${CPP_FILES})

-message(STATUS "BUILD_CUDA := ${BUILD_CUDA}")
-message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}")
+set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, mps)")
+set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda mps)
+option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
+
+if(APPLE)
+    set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1)
+endif()

 set(BNB_OUTPUT_NAME "bitsandbytes")

+message(STATUS "Building with backend ${COMPUTE_BACKEND}")
+
+if(${COMPUTE_BACKEND} STREQUAL "cuda")
+    if(APPLE)
+        message(FATAL_ERROR "CUDA is not supported on macOS" )
+    endif()
+
+    option(NO_CUBLASLT "Disable CUBLAS" OFF)
+    set(BUILD_CUDA ON)
+    set(BUILD_MPS OFF)
+    message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}")
+elseif(${COMPUTE_BACKEND} STREQUAL "mps")
+    if(NOT APPLE)
+        message(FATAL_ERROR "MPS is only supported on macOS" )
+    endif()
+
+    set(BUILD_CUDA OFF)
+    set(BUILD_MPS ON)
+else()
+    set(BUILD_CUDA OFF)
+    set(BUILD_MPS OFF)
+endif()

 if(BUILD_CUDA)
     enable_language(CUDA) # This will fail if CUDA is not found
+    find_package(CUDAToolkit REQUIRED)

     # Convert the CUDA version from X.Y.z to XY. There's probably a shorter way of doing this
     string(REGEX MATCH "^[0-9]+.[0-9]+" _CUDA_VERSION_FIRST_TWO "${CMAKE_CUDA_COMPILER_VERSION}")
@@ -87,28 +115,56 @@ if(BUILD_CUDA)
     if(NO_CUBLASLT)
         string(APPEND BNB_OUTPUT_NAME "_nocublaslt")
     endif()
-else()
-    message(STATUS "Building CPU Only")
-    string(APPEND BNB_OUTPUT_NAME "_cpu")
-    if(NO_CUBLASLT)
-        message(WARNING "We're building in CPU only mode but NO_CUBLASLT is enabled. It will have no effect.")
-    endif()
+    add_compile_definitions(BUILD_CUDA)
+elseif(BUILD_MPS)
+    if(NOT APPLE)
+        message(FATAL_ERROR "MPS is only supported on macOS" )
+    endif()
+
+    enable_language(OBJCXX)
+
+    list(APPEND SRC_FILES ${MPS_FILES})
+
+    string(APPEND BNB_OUTPUT_NAME "_mps")
+    add_compile_definitions(BUILD_MPS)
+    file(MAKE_DIRECTORY "build")
+    add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib"
+                COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_FILES}
+                COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib"
+                DEPENDS "${METAL_FILES}"
+                COMMENT "Compiling Metal kernels"
+                VERBATIM)
+    add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
+else()
+    set(LIBSUFFIX "cpu")
+    set(GPU_SOURCES)
+endif()
+
+if(WIN32)
+    # Export all symbols
+    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+endif()
+
+# Weird MSVC hacks
+if(MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2 /fp:fast")
 endif()

 set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
 add_library(bitsandbytes SHARED ${SRC_FILES})
-include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-target_include_directories(bitsandbytes PUBLIC csrc include)
 target_compile_features(bitsandbytes PUBLIC cxx_std_14)
+target_include_directories(bitsandbytes PUBLIC csrc include)

 if(BUILD_CUDA)
-    target_compile_definitions(bitsandbytes PUBLIC BUILD_CUDA)
-    target_link_libraries(bitsandbytes PUBLIC cudart cublas cusparse)
+    target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse)

     if(NO_CUBLASLT)
         target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
     else()
-        target_link_libraries(bitsandbytes PUBLIC cublasLt)
+        target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt)
     endif()

     set_target_properties(bitsandbytes
@@ -116,17 +172,20 @@ if(BUILD_CUDA)
             CUDA_SEPARABLE_COMPILATION ON
     )
 endif()
+if(BUILD_MPS)
+    add_dependencies(bitsandbytes metallib)
+    target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
+endif()

 if(WIN32)
     set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
 endif()

-set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME ${BNB_OUTPUT_NAME})
-if(MSVC)
-    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes)
-    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes)
-    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes)
-    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes)
-endif()
-
-set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes)
+set_target_properties(bitsandbytes
+    PROPERTIES
+        OUTPUT_NAME ${BNB_OUTPUT_NAME}
+        # We have to use a generator expression to prevent MSVC Debug/Release subdirs being made
+        RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>"
+        LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>"
+        POSITION_INDEPENDENT_CODE ON  # The `-fPIC` commands for non-windows compilers
+        WINDOWS_EXPORT_ALL_SYMBOLS ON # On Windows, export all c methods as DLL exports
+)
MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH)))
GPP:= /usr/bin/g++
#GPP:= /sw/gcc/11.2.0/bin/g++
ifeq ($(CUDA_HOME),)
CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev)
endif
ifndef CUDA_VERSION
ifneq ($(MAKECMDGOALS),clean)
$(warning WARNING: CUDA_VERSION not set. Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU)
CUDA_VERSION:=
endif
endif
NVCC := $(CUDA_HOME)/bin/nvcc
###########################################
CSRC := $(ROOT_DIR)/csrc
BUILD_DIR:= $(ROOT_DIR)/build
FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu
FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c
INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include
LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib
# NVIDIA NVCC compilation flags
COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler
CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler
# Later versions of CUDA support the new architectures
CC_CUDA11x := -gencode arch=compute_75,code=sm_75
CC_CUDA11x += -gencode arch=compute_80,code=sm_80
CC_CUDA11x += -gencode arch=compute_86,code=sm_86
CC_cublasLt110 := -gencode arch=compute_75,code=sm_75
CC_cublasLt110 += -gencode arch=compute_80,code=sm_80
CC_cublasLt111 := -gencode arch=compute_75,code=sm_75
CC_cublasLt111 += -gencode arch=compute_80,code=sm_80
CC_cublasLt111 += -gencode arch=compute_86,code=sm_86
CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89
CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90
all: $(BUILD_DIR) env
$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
cuda110_nomatmul_kepler: $(BUILD_DIR) env
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
cuda11x_nomatmul_kepler: $(BUILD_DIR) env
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
cuda110_nomatmul: $(BUILD_DIR) env
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
cuda11x_nomatmul: $(BUILD_DIR) env
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
cuda118_nomatmul: $(BUILD_DIR) env
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
cuda12x_nomatmul: $(BUILD_DIR) env
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
cuda110: $(BUILD_DIR) env
$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
cuda11x: $(BUILD_DIR) env
$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
cuda118: $(BUILD_DIR) env
$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
cuda12x: $(BUILD_DIR) env
$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++20 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
cpuonly: $(BUILD_DIR) env
$(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so
env:
@echo "ENVIRONMENT"
@echo "============================"
@echo "CUDA_VERSION: $(CUDA_VERSION)"
@echo "============================"
@echo "NVCC path: $(NVCC)"
@echo "GPP path: $(GPP) VERSION: `$(GPP) --version | head -n 1`"
@echo "CUDA_HOME: $(CUDA_HOME)"
@echo "CONDA_PREFIX: $(CONDA_PREFIX)"
@echo "PATH: $(PATH)"
@echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)"
@echo "============================"
$(BUILD_DIR):
mkdir -p build
mkdir -p dependencies
$(ROOT_DIR)/dependencies/cub:
git clone https://github.com/NVlabs/cub $(ROOT_DIR)/dependencies/cub
cd dependencies/cub; git checkout 1.11.0
clean:
rm -rf build/* *.egg*
rm -f bitsandbytes/libbitsandbytes*.so
#include <metal_stdlib>
using namespace metal;
#define HLF_MAX 65504
#define TH 1024
#define NUM 4
#define NUM_BLOCK 4096
template<bool STOCHASTIC>
static unsigned char quantize_scalar(
float rand,
device float* code,
float x)
{
int pivot = 127;
int upper_pivot = 255;
int lower_pivot = 0;
float lower = -1.0f;
float upper = 1.0f;
float val = code[pivot];
// i>>=1 = {32, 16, 8, 4, 2, 1}
for(int i = 64; i > 0; i>>=1)
{
if(x > val)
{
lower_pivot = pivot;
lower = val;
pivot+=i;
}
else
{
upper_pivot = pivot;
upper = val;
pivot-=i;
}
val = code[pivot];
}
if(upper_pivot == 255)
upper = code[upper_pivot];
if(lower_pivot == 0)
lower = code[lower_pivot];
if(!STOCHASTIC)
{
if(x > val)
{
float midpoint = (upper+val)*0.5f;
if(x > midpoint)
{
return upper_pivot;
}
else
return pivot;
}
else
{
float midpoint = (lower+val)*0.5f;
if(x < midpoint)
return lower_pivot;
else
return pivot;
}
}
else
{
if(x > val)
{
float dist_to_upper = fabs(upper-x);
float dist_full = upper-val;
if(rand >= dist_to_upper/dist_full) return upper_pivot;
else return pivot;
}
else
{
float dist_to_lower = fabs(lower-x);
float dist_full = val-lower;
if(rand >= dist_to_lower/dist_full) return lower_pivot;
else return pivot;
}
}
}
kernel void quantize(device float* code [[buffer(0)]],
device float* A [[buffer(1)]],
device uchar* out [[buffer(2)]],
constant uint& n [[buffer(3)]],
uint id [[thread_position_in_grid]]) {
const uint n_full = (NUM_BLOCK * (n / NUM_BLOCK)) + (n % NUM_BLOCK == 0 ? 0 : NUM_BLOCK);
uint valid_items = (id / NUM_BLOCK + 1 == (n + NUM_BLOCK - 1) / NUM_BLOCK) ? n - (id / NUM_BLOCK * NUM_BLOCK) : NUM_BLOCK;
const uint base_idx = (id / NUM_BLOCK * NUM_BLOCK);
float vals[NUM];
uchar qvals[NUM];
for (uint i = base_idx; i < n_full; i += ((n + NUM_BLOCK - 1) / NUM_BLOCK) * NUM_BLOCK) {
valid_items = n - i > NUM_BLOCK ? NUM_BLOCK : n - i;
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint j = 0; j < valid_items; j++) {
vals[j] = A[i + j];
}
for (uint j = 0; j < valid_items; j++) {
qvals[j] = quantize_scalar<false>(0.0f, code, vals[j]);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint j = 0; j < valid_items; j++) {
out[i + j] = qvals[j];
}
}
}
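For reference, the quantize_scalar helper above is a binary search over a 256-entry code table: starting at pivot 127 and halving the step (64, 32, ..., 1) it visits eight candidate entries, then rounds to the nearer of the two bracketing code values. The following Python sketch of the deterministic (non-stochastic) branch is an editorial illustration, not part of the commit, and assumes `code` is the sorted 256-entry codebook spanning roughly [-1, 1]:

# Editorial sketch only: mirrors the non-stochastic branch of quantize_scalar above.
def quantize_scalar(code, x):
    pivot, lower_pivot, upper_pivot = 127, 0, 255
    lower, upper = -1.0, 1.0
    val = code[pivot]
    for step in (64, 32, 16, 8, 4, 2, 1):  # binary search over the 256 entries
        if x > val:
            lower_pivot, lower = pivot, val
            pivot += step
        else:
            upper_pivot, upper = pivot, val
            pivot -= step
        val = code[pivot]
    if upper_pivot == 255:
        upper = code[255]
    if lower_pivot == 0:
        lower = code[0]
    # Round to whichever neighbouring code entry is closer to x (midpoint test).
    if x > val:
        return upper_pivot if x > (upper + val) * 0.5 else pivot
    return lower_pivot if x < (lower + val) * 0.5 else pivot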
#import <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
#define HLF_MAX 65504
#define TH 1024
#define NUM 4
#define NUM_BLOCK 4096
static inline MPSGraph* get_graph()
{
static MPSGraph* cur = nil;
if(!cur) {
cur = [[MPSGraph alloc] init];
}
return cur;
}
static inline id<MTLDevice> get_device()
{
NSError *error = nil;
static id<MTLDevice> device = nil;
if(!device) {
device = MTLCreateSystemDefaultDevice();
}
if(!device) {
NSLog(@"Failed to get MPS device");
abort();
}
return device;
}
static inline id<MTLLibrary> get_library()
{
NSError *error = nil;
static id<MTLLibrary> library = nil;
if(!library) {
library = [get_device() newLibraryWithURL:[NSURL fileURLWithPath:@"bitsandbytes.metallib"] error:&error];
}
if(!library) {
NSLog(@"Failed to load bitsandbytes.metallib");
abort();
}
return library;
}
/*MPSGraphTensor* dequantize_mps(MPSGraphTensor* code, MPSGraphTensor* A, int n)
{
id out = [get_graph() dequantizeTensor:(MPSGraphTensor*)A scaleTensor:(MPSGraphTensor*)code zeroPoint:0.0 dataType:MPSDataTypeInt8 axis:0 name:@"out"];
return out;
}*/
// MPSGraph function for quantize
extern "C" MPSGraphTensor* quantize_mps(MPSGraph* graph, MPSGraphTensor* code, MPSGraphTensor* A, int n)
{
id<MTLDevice> device = get_device();
id<MTLLibrary> library = get_library();
static id<MTLFunction> kernel = nil;
if(!kernel) {
kernel = [library newFunctionWithName:@"quantize"];
if(!kernel) {
NSLog(@"Failed to load bitsandbytes.metallib");
abort();
}
}
NSLog(@"Not implemented");
return nil;
}
@@ -6,6 +6,9 @@
 #if BUILD_CUDA
 #include <ops.cuh>
 #endif
+#if BUILD_MPS
+// #include <mps_ops.h>
+#endif
 #include <cpu_ops.h>

 // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
@@ -412,6 +415,7 @@ extern "C"
     { gemm_4bit_inference_naive_fp32(m, n, k, A, B, absmax, datatype, out, lda, ldb, ldc, blocksize); }
 #endif
+
     void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); }
     void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); }
 }
@@ -190,7 +190,7 @@ struct DirectInfo
             xi = xws;
         }
         else {
-            myassert(Gap==1, "if Gap>1 then X workspace must be provided");
+            myassert((Gap==1), "if Gap>1 then X workspace must be provided");
             xi = x;
         }
@@ -52,6 +52,7 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
 private:
     typedef AlgoScalarBase<T, A> base_t;

+#ifdef USE_SSE2
     FORCE_INLINE
     //NO_INLINE
     void resolve(const FVec<SSE, float>& vz, const IVec<SSE, float>& bidx, uint32 *pr) const
@@ -135,6 +136,7 @@ private:
         pr[0] = u.ui32[0];
         pr[1] = u.ui32[2];
     }
+#endif // USE_SSE2

 #ifdef USE_AVX
@@ -4,10 +4,40 @@
 #include <stdexcept>
 #include <sstream>

+#if defined(__aarch64__)
+#ifdef __CUDACC__
+#undef USE_NEON // Doesn't work with nvcc, undefined symbols
+#else
+#include <arm_neon.h>
+#undef USE_NEON // Not yet implemented
+#endif
+#undef USE_AVX // x86_64 only
+#undef USE_AVX2 // x86_64 only
+#undef USE_SSE2 // x86_64 only
+#undef USE_SSE41 // x86_64 only
+#undef USE_SSE42 // x86_64 only
+#undef USE_FMA // x86_64 only
+
+#ifdef USE_NEON
+typedef float32x4_t __m128;
+typedef int32x4_t __m128i;
+typedef float64x2_t __m128d;
+#else
+typedef struct {float a; float b; float c; float d;} __m128;
+typedef struct {int a; int b; int c; int d;} __m128i;
+typedef struct {double a; double b;} __m128d;
+#endif
+
+#else
+#undef USE_NEON // ARM64 only
+
 #ifdef __FMA__
 #define USE_FMA
 #endif

+#if !defined(__SSE2__) && !defined(_MSC_VER)
+#error Compiler must support SSE2
+#endif
+#define USE_SSE2
+
+#if defined(__aarch64__)
+#else
 #ifdef __AVX2__
 #define USE_AVX2
 #endif
@@ -24,7 +54,8 @@
 #ifdef __SSE4_2__
 #define USE_SSE42
 #endif
+#endif
+#endif

 #ifndef _MSC_VER
 #include <stdint.h>
@@ -2,6 +2,46 @@
 #include "Portable.h"

+#ifdef USE_SSE2
+#include <emmintrin.h>
+#if defined(USE_AVX) || defined(USE_AVX2)
+#include <immintrin.h>
+#else
+#ifdef USE_SSE41
+#include <smmintrin.h>
+#endif
+#endif
+#endif
+
+namespace BinSearch {
+namespace Details {
+
+template <InstrSet I, typename T>
+struct FTOITraits{};
+
+template <InstrSet I, class T>
+struct FVec;
+
+template <InstrSet I, class T>
+struct IVec;
+
+template <InstrSet I, class T>
+struct FVec1;
+
+template <> struct InstrFloatTraits<Scalar, float>
+{
+    typedef __m128 vec_t;
+};
+
+template <> struct InstrFloatTraits<Scalar, double>
+{
+    typedef __m128d vec_t;
+};
+
+}
+}
+
+#if !defined(__aarch64__)
 #ifdef USE_SSE42
 #ifndef _MSC_VER
 #include <popcntintrin.h>
@@ -26,29 +66,11 @@ FORCE_INLINE int popcnt32(int x32)
 } // namespace
 #endif

-#if defined(USE_AVX) || defined(USE_AVX2)
-#include <immintrin.h>
-#else
-#include <emmintrin.h>
-#ifdef USE_SSE41
-#include <smmintrin.h>
-#endif
-#endif
-
 #include "Type.h"

 namespace BinSearch {
 namespace Details {

-template <InstrSet I, class T>
-struct FVec;
-
-template <InstrSet I, class T>
-struct IVec;
-
-template <InstrSet I, class T>
-struct FVec1;
-
 template <> struct InstrIntTraits<SSE>
 {
     typedef __m128i vec_t;
@@ -64,18 +86,8 @@ template <> struct InstrFloatTraits<SSE, double>
     typedef __m128d vec_t;
 };

-template <> struct InstrFloatTraits<Scalar, float>
-{
-    typedef float vec_t;
-};
-
-template <> struct InstrFloatTraits<Scalar, double>
-{
-    typedef double vec_t;
-};
-
-template <InstrSet I, typename T>
-struct FTOITraits
+template <>
+struct FTOITraits<SSE, float>
 {
     typedef IVec<SSE, float> vec_t;
 };
@@ -295,9 +307,11 @@ FORCE_INLINE FVec<SSE,float> operator- (const FVec<SSE,float>& a, const FVec<
 FORCE_INLINE FVec<SSE,float> operator* (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_mul_ps( a, b ); }
 FORCE_INLINE FVec<SSE,float> operator/ (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_div_ps( a, b ); }
 FORCE_INLINE IVec<SSE,float> ftoi (const FVec<SSE,float>& a) { return _mm_cvttps_epi32(a); }
+#ifndef __clang__ // Conflicts with builtin operator
 FORCE_INLINE IVec<SSE,float> operator<= (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); }
 FORCE_INLINE IVec<SSE,float> operator>= (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); }
 FORCE_INLINE IVec<SSE,float> operator< (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_castps_si128(_mm_cmplt_ps(a, b)); }
+#endif
 #ifdef USE_FMA
 FORCE_INLINE FVec<SSE, float> mulSub(const FVec<SSE, float>& a, const FVec<SSE, float>& b, const FVec<SSE, float>& c) { return _mm_fmsub_ps(a, b, c); }
 #endif
@@ -349,9 +363,11 @@ FORCE_INLINE FVec<SSE,double> operator- (const FVec<SSE,double>& a, const FVec
 FORCE_INLINE FVec<SSE,double> operator* (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_mul_pd( a, b ); }
 FORCE_INLINE FVec<SSE,double> operator/ (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_div_pd( a, b ); }
 FORCE_INLINE IVec<SSE,float> ftoi (const FVec<SSE,double>& a) { return _mm_cvttpd_epi32(a); }
+#ifndef __clang__ // Conflicts with builtin operator
 FORCE_INLINE IVec<SSE,double> operator<= (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); }
 FORCE_INLINE IVec<SSE,double> operator< (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); }
 FORCE_INLINE IVec<SSE,double> operator>= (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); }
+#endif
 #ifdef USE_FMA
 FORCE_INLINE FVec<SSE, double> mulSub(const FVec<SSE, double>& a, const FVec<SSE, double>& b, const FVec<SSE, double>& c ) { return _mm_fmsub_pd(a, b, c); }
 #endif
@@ -570,3 +586,4 @@ FORCE_INLINE FVec<AVX, double> mulSub(const FVec<AVX, double>& a, const FVec<AVX
 } // namespace Details
 } // namespace BinSearch
+#endif // !defined(__aarch64__)
@@ -10,7 +10,7 @@ using std::size_t;
 namespace BinSearch {

-enum InstrSet { Scalar, SSE, AVX };
+enum InstrSet { Scalar, SSE, AVX, Neon };

 #define ALGOENUM(x, b) x,
 enum Algos
 [build-system]
-requires = [
-    "setuptools>=42",
-    "wheel"
-]
+requires = [ "setuptools", "wheel" ]
 build-backend = "setuptools.build_meta"

 [tool.ruff]
# Requirements used for GitHub actions
pytest==7.2.2
einops==0.6.0
wheel==0.40.0
lion-pytorch==0.0.6
scipy==1.11.4
pandas==2.2.0
# Requirements used for local development
setuptools>=63
pytest~=7.2.2
einops~=0.6.0
wheel~=0.40.0
lion-pytorch~=0.0.6
scipy~=1.11.4
pandas~=2.2.0
matplotlib~=3.8.2
@@ -5,10 +5,10 @@
 import glob
 import os

-from setuptools import Extension, find_packages, setup
+from setuptools import find_packages, setup
+from setuptools.dist import Distribution

-libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.so"))
-libs += list(glob.glob("./bitsandbytes/libbitsandbytes*.dll"))
+libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.*"))
 libs = [os.path.basename(p) for p in libs]
 print("libs:", libs)
@@ -17,6 +17,12 @@ def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()

+
+# Tested with wheel v0.29.0
+class BinaryDistribution(Distribution):
+    def has_ext_modules(self):
+        return True
+
 setup(
     name="bitsandbytes",
     version="0.43.0.dev0",
@@ -28,18 +34,16 @@ setup(
     url="https://github.com/TimDettmers/bitsandbytes",
     packages=find_packages(),
     package_data={"": libs},
-    install_requires=['torch', 'numpy'],
+    install_requires=["torch", "numpy"],
     extras_require={
-        'benchmark': ['pandas', 'matplotlib'],
-        'test': ['scipy'],
+        "benchmark": ["pandas", "matplotlib"],
+        "test": ["scipy"],
     },
     long_description=read("README.md"),
     long_description_content_type="text/markdown",
-    # HACK: pretend we have a native extension module so the wheel is tagged
-    # correctly with a platform tag (e.g. `-linux_x86_64.whl`).
-    ext_modules=[Extension("bitsandbytes", sources=[], language="c")],
     classifiers=[
         "Development Status :: 4 - Beta",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
+    distclass=BinaryDistribution,
 )
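A note on the setup.py change above: replacing the empty `Extension` hack with a `Distribution` subclass whose `has_ext_modules()` returns True still tells the wheel machinery that the package ships native binaries, so the built wheel gets a platform tag rather than a pure-Python tag. A minimal self-contained sketch of the same pattern (illustrative only; the package name below is hypothetical):

# Illustrative sketch of the BinaryDistribution pattern used above.
from setuptools import setup
from setuptools.dist import Distribution


class BinaryDistribution(Distribution):
    def has_ext_modules(self):
        # Report that compiled artifacts are bundled so bdist_wheel
        # emits a platform-specific wheel instead of a universal one.
        return True


setup(
    name="example-native-pkg",  # hypothetical name, for illustration only
    version="0.0.1",
    packages=[],
    distclass=BinaryDistribution,
)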