Unverified commit bdb8b2b7 authored by Matthew Douglas, committed by GitHub

Add CUDA 13.0 Support (#1761)

* CUDA 13 build enablement

* Try to fix Windows build workflow

* Add torch 2.9+cu130 to tests

* Fix python version

* Update test workflow

* Don't test CPU on torch 2.9 yet

* Update doc
parent e8170363
@@ -12,13 +12,16 @@ elif [ "${build_arch}" = "aarch64" ]; then
     build_capability="75;80;90"
     # CUDA 12.8+: Add sm100/sm120
-    [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="75;80;90;100;120"
+    [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* || "${cuda_version}" == 13.*.* ]] && build_capability="75;80;90;100;120"
 else
     # By default, target Pascal through Hopper.
     build_capability="60;70;75;80;86;89;90"
     # CUDA 12.8+: Add sm100 and sm120; remove < sm70 to align with PyTorch 2.8+cu128 minimum
     [[ "${cuda_version}" == 12.8.* || "${cuda_version}" == 12.9.* ]] && build_capability="70;75;80;86;89;90;100;120"
+    # CUDA 13.0+: Remove < sm75 to align with PyTorch 2.9+cu130 minimum
+    [[ "${cuda_version}" == 13.*.* ]] && build_capability="75;80;86;89;90;100;120"
 fi

 [[ "${build_os}" = windows-* ]] && python3 -m pip install ninja
...
@@ -72,21 +72,22 @@ jobs:
           - os: windows-latest
             arch: x86_64
         cuda_version:
-          ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1", "12.9.1"]
+          ["11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "12.6.3", "12.8.1", "12.9.1", "13.0.1"]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
       # Windows: We install Cuda on the agent (slow)
-      - uses: Jimver/cuda-toolkit@c35baa1a18fd1fc9dcf47c5bd839bf30559c0bc3 # v0.2.24
+      #- uses: Jimver/cuda-toolkit@433d453c1fa37d10a3254452fa8e284441c9192d # v0.2.27
+      - uses: N-Storm/cuda-toolkit@d68ba29a800229200a2c3f572f9e816d7f67cdb4 # v0.2.24m
         if: startsWith(matrix.os, 'windows')
         id: cuda-toolkit
         with:
-          # Temporary: Use CUDA 12.9.0 for Windows until 12.9.1 is supported with this action.
-          cuda: ${{ matrix.cuda_version == '12.9.1' && '12.9.0' || matrix.cuda_version }}
-          method: "network"
-          sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]'
-          linux-local-args: '["--toolkit"]'
+          # Temporary: Use CUDA 13.0.0 for Windows until 13.0.1 is supported with this action.
+          cuda: ${{ matrix.cuda_version == '13.0.1' && '13.0.0' || matrix.cuda_version }}
+          method: "local"
           use-github-cache: false
+          use-local-cache: false
+          log-file-suffix: ${{matrix.os}}-${{matrix.cuda_version}}.txt
       - name: Setup MSVC
         if: startsWith(matrix.os, 'windows')
         uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
...
-name: Unit tests
+name: Nightly Tests

 on:
   workflow_dispatch:
@@ -49,6 +49,7 @@ jobs:
   build-cuda:
     strategy:
       matrix:
+        # TODO: Add 13.0.1 when we have runners with new enough drivers.
         cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
         os: [ubuntu-22.04, ubuntu-22.04-arm]
         include:
@@ -111,7 +112,7 @@ jobs:
             arch: aarch64
           - os: ubuntu-22.04-arm
             arch: aarch64
-            torch_version: "2.5.1"
+            torch_version: "2.5.1" # Higher minimum requirement for aarch64
           - os: windows-2025
             arch: x86_64
           - os: macos-15
@@ -136,7 +137,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.10'
       - name: Setup MSVC
         if: startsWith(matrix.os, 'windows')
@@ -182,7 +183,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.10'
       - name: Install dependencies
         run: |
@@ -313,7 +314,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.10'
       - name: Install PyTorch
         run: pip install torch==${{ matrix.torch_version }} --index-url https://download.pytorch.org/whl/xpu
@@ -343,7 +344,7 @@ jobs:
         os: [ubuntu-22.04, windows-2025]
         arch: [x86_64]
         gpu: [T4, L40S]
-        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1", "12.9.1"] #, "13.0.1"]
         include:
           - cuda_version: "11.8.0"
             torch_version: "2.3.1"
@@ -351,12 +352,18 @@ jobs:
           - cuda_version: "12.6.3"
            torch_version: "2.6.0"
            pypi_index: "https://download.pytorch.org/whl/cu126"
-          - cuda_version: "12.8.1"
-            torch_version: "2.7.1"
-            pypi_index: "https://download.pytorch.org/whl/cu128"
           - cuda_version: "12.9.1"
             torch_version: "2.8.0"
             pypi_index: "https://download.pytorch.org/whl/cu129"
+          - cuda_version: "12.8.1"
+            torch_version: "2.9.0"
+            pypi_index: "https://download.pytorch.org/whl/test/cu128"
+          # Note: Currently our runners do not have new enough drivers for CUDA 13.
+          # Add this when supported.
+          # - cuda_version: "13.0.1"
+          #   torch_version: "2.9.0"
+          #   pypi_index: "https://download.pytorch.org/whl/test/cu130"
           # Linux L40S runners
@@ -395,6 +402,8 @@ jobs:
         exclude:
           # Our current T4 Windows runner has a driver too old (471.11)
           # and cannot support CUDA 12+. Skip for now.
+          - os: windows-2025
+            cuda_version: "13.0.1"
           - os: windows-2025
             cuda_version: "12.9.1"
           - os: windows-2025
@@ -424,7 +433,7 @@ jobs:
       - name: Setup Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: '3.10'
       - name: Install dependencies
         run: |
...
@@ -113,30 +113,36 @@ if(BUILD_CUDA)
         )
     endif()

-    if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.4")
-        message(FATAL_ERROR "CUDA Version < 11.4 is not supported")
-    elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
-        message(FATAL_ERROR "CUDA Version > 12 is not supported")
+    if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.8")
+        message(FATAL_ERROR "CUDA Version < 11.8 is not supported")
+    elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "14.0")
+        message(FATAL_ERROR "CUDA Version > 13 is not supported")
     endif()

     # CMake < 3.23.0 does not define CMAKE_CUDA_ARCHITECTURES_ALL.
     if(CMAKE_VERSION VERSION_LESS "3.23.0")
         message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
-        # 11.4+ supports these at a minimum.
-        set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87)
-        set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
-
-        # CUDA 11.8 adds support for Ada and Hopper.
-        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90)
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90)
-        endif()
-
-        # CUDA 12.8 adds support for Blackwell.
-        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 101 120)
-            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120)
-        endif()
+        if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
+            # Starting in CUDA 13.0, Thor Blackwell is renamed to SM110.
+            # Support for architectures older than Turing (SM75) is removed.
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 75 80 86 87 88 89 90 100 103 110 120 121)
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 80 90 100 110 120)
+        else()
+            # 11.8-12.9 supports these at a minimum.
+            set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80 86 87 89 90)
+            set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80 90)
+
+            # CUDA 12.8 adds support for Blackwell.
+            if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
+                list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 100 101 120 121)
+                list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 100 120)
+            endif()
+
+            # CUDA 12.9 adds SM103 (Blackwell B300).
+            if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.9")
+                list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 103)
+            endif()
+        endif()
     endif()
 endif()
@@ -252,7 +258,7 @@ endif()
 set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)

 add_library(bitsandbytes SHARED ${SRC_FILES})
-target_compile_features(bitsandbytes PUBLIC cxx_std_14)
+target_compile_features(bitsandbytes PUBLIC cxx_std_17)
 target_include_directories(bitsandbytes PUBLIC csrc include)
...
@@ -16,6 +16,14 @@
 #include <math_constants.h>
 #include <mma.h>

+#if CCCL_VERSION >= 2008002
+#include <cuda/std/functional>
+#define CUB_REDUCTIONOP_MAX \
+    cuda::maximum<> {}
+#else
+#define CUB_REDUCTIONOP_MAX cub::Max()
+#endif
+
 #define HLF_MAX 65504
 #define TH 1024
 #define NUM 4
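Background on the shim above (an aside, not part of the diff): CUDA 13 ships CCCL 3.x, which removes the legacy `cub::Max()` and `cub::Sum()` functors, so the macro picks `cuda::maximum<>` on new toolkits and keeps `cub::Max()` on older ones (2008002 encodes CCCL 2.8.2; on toolkits that predate the `CCCL_VERSION` macro, the undefined macro evaluates to 0 in the `#if` and the `cub::Max()` branch is chosen). The sketch below is a minimal, self-contained illustration of the same pattern; the kernel `block_absmax` and everything in it besides `CUB_REDUCTIONOP_MAX` is hypothetical, not code from this repository.

```cuda
// Minimal sketch: one BlockReduce call path that compiles against both
// CUDA 12.x (cub::Max) and CUDA 13 / CCCL 3.x (cuda::maximum<>).
#include <cub/block/block_reduce.cuh>

#if CCCL_VERSION >= 2008002
#include <cuda/std/functional>
#define CUB_REDUCTIONOP_MAX cuda::maximum<>{}
#else
#define CUB_REDUCTIONOP_MAX cub::Max()
#endif

template <int BLOCK_DIM>
__global__ void block_absmax(const float* in, float* absmax, int n) {
    using BlockReduce = cub::BlockReduce<float, BLOCK_DIM>;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const int i = blockIdx.x * BLOCK_DIM + threadIdx.x;
    const int valid_items = min(n - blockIdx.x * BLOCK_DIM, BLOCK_DIM);
    float v = (i < n) ? fabsf(in[i]) : 0.0f;

    // Same call shape as the kernels in this commit: Reduce(value, op, valid_items).
    float block_max = BlockReduce(temp_storage).Reduce(v, CUB_REDUCTIONOP_MAX, valid_items);

    // cub::Sum() is gone as well; the Sum() convenience member exists on both
    // old and new CUB, e.g. BlockReduce(temp_storage).Sum(v, valid_items).
    if (threadIdx.x == 0)
        absmax[blockIdx.x] = block_max;
}
```

This is also why the later hunks in this commit replace `.Reduce(x, cub::Sum(), n)` with `.Sum(x, n)` rather than introducing a second macro.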
@@ -365,7 +373,7 @@ __global__ void kQuantizeBlockwise(
         for (int j = 0; j < NUM_PER_TH; j++)
             local_abs_max = fmaxf(local_abs_max, fabsf((float)vals[j]));

-        local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, cub::Max(), valid_items);
+        local_abs_max = BlockReduce(reduce).Reduce(local_abs_max, CUB_REDUCTIONOP_MAX, valid_items);

         if (threadIdx.x == 0) {
             smem_absmax_value[0] = 1.0f / local_abs_max;
@@ -951,12 +959,12 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b
     }

     __syncthreads();
-    local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cub::Max(), valid_items);
+    local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, CUB_REDUCTIONOP_MAX, valid_items);
     __syncthreads();
-    local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, cub::Max(), valid_items);
+    local_max_s2 = BlockReduce(temp_storage.reduce).Reduce(local_max_s2, CUB_REDUCTIONOP_MAX, valid_items);

     if (unorm != NULL) {
         __syncthreads();
-        local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cub::Sum(), valid_items);
+        local_unorm = BlockReduce(temp_storage.reduce).Sum(local_unorm, valid_items);
     }

     if (threadIdx.x == 0) {
@@ -1162,13 +1170,13 @@ __global__ void __launch_bounds__(NUM_THREADS, 2) kPreconditionOptimizerStatic8b
     }

     __syncthreads();
-    local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, cub::Max(), valid_items);
+    local_max_s1 = BlockReduce(temp_storage.reduce).Reduce(local_max_s1, CUB_REDUCTIONOP_MAX, valid_items);
     if (threadIdx.x == 0) {
         atomicMax(&new_max1[0], local_max_s1);
     }

     if (unorm != NULL) {
         __syncthreads();
-        local_unorm = BlockReduce(temp_storage.reduce).Reduce(local_unorm, cub::Sum(), valid_items);
+        local_unorm = BlockReduce(temp_storage.reduce).Sum(local_unorm, valid_items);
         if (threadIdx.x == 0) {
             atomicAdd(&unorm[0], local_unorm);
         }
@@ -1473,11 +1481,11 @@ __launch_bounds__(256, 3) __global__ void kOptimizerStatic8bit2StateBlockwise(
     }

     // reduce: 2.51/1.60 -> 2.67/1.69
-    new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cub::Max());
-    new_local_abs_max2 = BlockReduce2(reduce2).Reduce(new_local_abs_max2, cub::Max());
+    new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, CUB_REDUCTIONOP_MAX);
+    new_local_abs_max2 = BlockReduce2(reduce2).Reduce(new_local_abs_max2, CUB_REDUCTIONOP_MAX);

     if (OPTIMIZER == ADEMAMIX) {
-        new_local_abs_max3 = BlockReduce3(reduce3).Reduce(new_local_abs_max3, cub::Max());
+        new_local_abs_max3 = BlockReduce3(reduce3).Reduce(new_local_abs_max3, CUB_REDUCTIONOP_MAX);
     }

     if (threadIdx.x == 0) {
@@ -1686,7 +1694,7 @@ __launch_bounds__(256, 3) __global__ void kOptimizerStatic8bit1StateBlockwise(
     }

     // reduce: 2.51/1.60 -> 2.67/1.69
-    new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, cub::Max());
+    new_local_abs_max1 = BlockReduce1(reduce1).Reduce(new_local_abs_max1, CUB_REDUCTIONOP_MAX);

     if (threadIdx.x == 0)
         smem_exchange1[0] = new_local_abs_max1;
@@ -1792,7 +1800,7 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
     }

     // Reduce thread-local absmax across the block.
-    const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
+    const TReduction row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, CUB_REDUCTIONOP_MAX, cols);
     if (threadIdx.x == 0) {
         // Save our block's absmax to shared memory for the quantization step.
         rowStats[row_id] = smem_row_absmax = row_absmax;
@@ -1847,7 +1855,7 @@ __launch_bounds__(1024, BNB_MAX_THREADS_PER_SM / 1024) __global__
     // Reduce thread-local absmax across the block.
     // TODO: Consider algorithm BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
-    const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, cub::Max(), cols);
+    const float row_absmax = BlockReduceT(temp_storage).Reduce(row_local_absmax, CUB_REDUCTIONOP_MAX, cols);
     if (threadIdx.x == 0) {
         // Save our block's absmax to shared memory for the quantization step.
         rowStats[row_id] = row_absmax;
...
@@ -4,6 +4,7 @@
 // LICENSE file in the root directory of this source tree.

 #if BUILD_CUDA
+#include <cuda_runtime_api.h>
 #include <ops.cuh>
 #endif
 #if BUILD_HIP
@@ -710,7 +711,15 @@ void cprefetch(void* ptr, size_t bytes, int device) {
     if (hasPrefetch == 0)
         return;

+#if CUDART_VERSION >= 13000
+    cudaMemLocation loc{};
+    loc.type = cudaMemLocationTypeDevice;
+    loc.id = device;
+    CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, loc, 0u, 0));
+#else
     CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0));
+#endif
+
     CUDA_CHECK_RETURN(cudaPeekAtLastError());
 }
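Context for the branch above (an aside, not part of the commit): in CUDA 13.0 the default `cudaMemPrefetchAsync` signature takes a `cudaMemLocation` plus a flags word instead of a bare device ordinal. Below is a hedged, self-contained sketch of a version-portable helper in the same spirit; the name `prefetch_to_device` and the choice to return the error rather than check it are illustrative, not from this repository.

```cuda
// Illustrative helper: prefetch managed memory to a device using whichever
// cudaMemPrefetchAsync signature the toolkit provides.
// CUDART_VERSION >= 13000 means CUDA 13.0 or newer.
#include <cuda_runtime_api.h>

static cudaError_t prefetch_to_device(const void* ptr, size_t bytes, int device, cudaStream_t stream) {
#if CUDART_VERSION >= 13000
    // CUDA 13.0+: destination is a cudaMemLocation; the fourth argument is a
    // flags field (0 = default behavior).
    cudaMemLocation loc{};
    loc.type = cudaMemLocationTypeDevice;
    loc.id = device;
    return cudaMemPrefetchAsync(ptr, bytes, loc, 0u, stream);
#else
    // CUDA 11.x / 12.x: destination is the device ordinal.
    return cudaMemPrefetchAsync(ptr, bytes, device, stream);
#endif
}
```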
...
@@ -40,10 +40,12 @@ The currently distributed `bitsandbytes` packages are built with the following c
 |--------------------|------------------|----------------------|--------------|
 | **Linux x86-64**   | 11.8 - 12.6      | GCC 11.2             | sm60, sm70, sm75, sm80, sm86, sm89, sm90 |
 | **Linux x86-64**   | 12.8 - 12.9      | GCC 11.2             | sm70, sm75, sm80, sm86, sm89, sm90, sm100, sm120 |
+| **Linux x86-64**   | 13.0             | GCC 11.2             | sm75, sm80, sm86, sm89, sm90, sm100, sm120 |
 | **Linux aarch64**  | 11.8 - 12.6      | GCC 11.2             | sm75, sm80, sm90 |
-| **Linux aarch64**  | 12.8 - 12.9      | GCC 11.2             | sm75, sm80, sm90, sm100, sm120 |
+| **Linux aarch64**  | 12.8 - 13.0      | GCC 11.2             | sm75, sm80, sm90, sm100, sm120 |
 | **Windows x86-64** | 11.8 - 12.6      | MSVC 19.43+ (VS2022) | sm50, sm60, sm75, sm80, sm86, sm89, sm90 |
 | **Windows x86-64** | 12.8 - 12.9      | MSVC 19.43+ (VS2022) | sm70, sm75, sm80, sm86, sm89, sm90, sm100, sm120 |
+| **Windows x86-64** | 13.0             | MSVC 19.43+ (VS2022) | sm75, sm80, sm86, sm89, sm90, sm100, sm120 |

 Use `pip` or `uv` to install:
...