Unverified commit 888788d7 authored by pnunna93, committed by GitHub

Enable ROCm backend with custom ops integration (#1683)



* Port ROCm changes from multi-backend-refactor branch

* Update ops.py

* Update functional.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update functional.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update functional.py

* Update functional.py

* Update functional.py

* Update functional.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update functional.py

* Update functional.py

* Update functional.py

* Update test_ops.py

* Update test_functional.py

* Update test_ops.py

* Update test_functional.py

* Update test_functional.py

* Update functional.py

* Update functional.py

* Update ops.py

* Update ops.py

* Update test_functional.py

* Update test_functional.py

* Update cextension.py

* Update cuda_specs.py

* Update cuda_specs.py

* Update test_functional.py

* Update test_linear4bit.py

* Update test_cuda_setup_evaluator.py

* Update test_functional.py

* Update modules.py

* Update modules.py

* Update ops.py

* Update test_linear4bit.py

* Update ops.py

* Update ops.py

* Update test_linear4bit.py

* Update test_linear4bit.py

* Update python-package.yml

* Update python-package.yml

* Update python-package.yml

* Update python-package.yml

* Create build-rocm.sh

* Update cuda_specs.py

* Fix trailing whitespace

* Remove conflicts.diff

* update for hipblasVersionMajor >=3

* Update test_functional.py

* Update test_linear4bit.py

* Update test_ops.py

* Update main.py

* Update test_functional.py

* Update test_linear4bit.py

* Update test_ops.py

* Update test_linear4bit.py

* Lint

* Lint

* Update helpers.py

* Update test_functional.py

* Update test_linear4bit.py

* Update test_ops.py

* Lint

* Update pythonInterface.cpp

* lint fix

* lint

* Update pythonInterface.cpp

* revert permissions change

* Fix indentation

* Update kernels_hip.cuh

* Update kernels.hip

* Update ops.hip

* Update ops_hip.cuh

* Update kernels_hip.cuh

* Update kernels.hip

* Update kernels.hip

* Update ops.hip

* Update ops_hip.cuh

* Update ops.hip

* Update CMakeLists.txt

* Update functional.py

* Update cextension.py

* Update cextension.py

---------
Co-authored-by: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Co-authored-by: MISHANMAUYRA <mishanmaurya31081@gmail.com>
Co-authored-by: amcamd <andrew.chapman@amd.com>
Co-authored-by: Prasanth Nunna <root@banff-cyxtera-s78-1.amd.com>
parent a1cd3f6e
#!/bin/bash
declare build_arch
declare build_os
declare rocm_version
set -xeuo pipefail
# Target AMD GPU architectures: gfx90a (MI200 / CDNA2), gfx942 (MI300 / CDNA3), gfx1100 (RDNA3)
bnb_rocm_arch="gfx90a;gfx942;gfx1100"
if [ "${build_os:0:6}" == ubuntu ]; then
image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
echo "Using image $image"
docker run --rm --platform "linux/$build_arch" -i \
-w /src -v "$PWD:/src" "$image" sh -c \
"apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
&& cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
&& cmake --build ."
fi
output_dir="output/${build_os}/${build_arch}"
mkdir -p "${output_dir}"
(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
@@ -102,10 +102,55 @@ jobs:
path: output/*
retention-days: 7
build-shared-libs-rocm:
strategy:
matrix:
os: [ubuntu-22.04]
arch: [x86_64]
rocm_version:
["6.1.2", "6.2.4", "6.3.2"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- name: Set up Docker multiarch
uses: docker/setup-qemu-action@v3
- name: Clean up disk space
run: |
sudo rm -rf \
/usr/share/dotnet \
/opt/ghc \
"/usr/local/share/boost" \
"$AGENT_TOOLSDIRECTORY" \
/opt/hostedtoolcache \
/opt/google/chrome \
/opt/microsoft/msedge \
/opt/microsoft/powershell \
/opt/pipx \
/usr/lib/mono \
/usr/local/julia* \
/usr/local/lib/android \
/usr/local/lib/node_modules \
/usr/local/share/chromium \
/usr/local/share/powershell \
/usr/share/swift
- name: Build C++
run: bash .github/scripts/build-rocm.sh
env:
build_os: ${{ matrix.os }}
build_arch: ${{ matrix.arch }}
rocm_version: ${{ matrix.rocm_version }}
- name: Upload build artifact
uses: actions/upload-artifact@v4
with:
name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
path: output/*
retention-days: 7
build-wheels:
needs:
- build-shared-libs
- build-shared-libs-cuda
- build-shared-libs-rocm
strategy:
matrix:
os: [ubuntu-22.04, ubuntu-22.04-arm, windows-latest, macos-latest]
@@ -173,6 +218,7 @@ jobs:
merge-multiple: true
- name: Inspect tmp directory after downloading artifacts
run: |
ls -alFR tmp/
WHEEL_COUNT=$(find tmp/ -type f -name "*.whl" | wc -l)
@@ -210,6 +256,7 @@ jobs:
- uses: actions/checkout@v4
with:
path: repo
- name: Delete old pre-release (if exists)
run: |
cd repo && gh release delete continuous-release_main --cleanup-tag -y
@@ -25,13 +25,14 @@ endif()
# Define included source files
set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp)
set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
set(MPS_FILES csrc/mps_ops.mm)
set(METAL_FILES csrc/mps_kernels.metal)
# C++ sources are always included
list(APPEND SRC_FILES ${CPP_FILES})
set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, mps)")
set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda mps)
set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps)")
set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps)
option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
if(APPLE)
@@ -47,15 +48,25 @@ if(${COMPUTE_BACKEND} STREQUAL "cuda")
message(FATAL_ERROR "CUDA is not supported on macOS" )
endif()
set(BUILD_CUDA ON)
set(BUILD_HIP OFF)
set(BUILD_MPS OFF)
elseif(${COMPUTE_BACKEND} STREQUAL "hip")
if(APPLE)
message(FATAL_ERROR "HIP is not supported on macOS" )
endif()
set(BUILD_CUDA OFF)
set(BUILD_HIP ON)
set(BUILD_MPS OFF)
elseif(${COMPUTE_BACKEND} STREQUAL "mps")
if(NOT APPLE)
message(FATAL_ERROR "MPS is only supported on macOS" )
endif()
set(BUILD_CUDA OFF)
set(BUILD_HIP OFF)
set(BUILD_MPS ON)
else()
set(BUILD_CUDA OFF)
set(BUILD_HIP OFF)
set(BUILD_MPS OFF)
endif()
@@ -160,6 +171,33 @@ if(BUILD_CUDA)
string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}")
add_compile_definitions(BUILD_CUDA)
elseif(BUILD_HIP)
enable_language(HIP)
message(STATUS "HIP Compiler: ${CMAKE_HIP_COMPILER}")
if(DEFINED BNB_ROCM_ARCH)
set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH})
else()
if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100")
elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
endif()
endif()
message(STATUS "HIP Targets: ${CMAKE_HIP_ARCHITECTURES}")
list(APPEND SRC_FILES ${HIP_FILES})
string(APPEND BNB_OUTPUT_NAME "_rocm")
# Get the installed HIP version to suffix the output library name (e.g. libbitsandbytes_rocm62)
execute_process(COMMAND hipconfig --version OUTPUT_VARIABLE HIP_CONFIG_VERSION)
string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}")
string(REPLACE "." "" HIP_VERSION_SHORT "${HIP_VERSION}")
string(APPEND BNB_OUTPUT_NAME "${HIP_VERSION_SHORT}")
add_compile_definitions(__HIP_PLATFORM_AMD__)
add_compile_definitions(__HIP_PLATFORM_HCC__)
add_compile_definitions(BUILD_HIP)
elseif(BUILD_MPS)
if(NOT APPLE)
message(FATAL_ERROR "MPS is only supported on macOS" )
@@ -208,6 +246,41 @@ if(BUILD_CUDA)
CUDA_SEPARABLE_COMPILATION ON
)
endif()
if(BUILD_HIP)
if(NOT DEFINED ENV{ROCM_PATH})
set(ROCM_PATH /opt/rocm)
else()
set(ROCM_PATH $ENV{ROCM_PATH})
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
macro(find_package_and_print_version PACKAGE_NAME)
find_package("${PACKAGE_NAME}" ${ARGN})
message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}")
endmacro()
find_package_and_print_version(hipblas REQUIRED)
find_package_and_print_version(hiprand REQUIRED)
find_package_and_print_version(hipsparse REQUIRED)
## Hacky way of excluding hip::amdhip64: when it is linked, many tests (e.g. adam8bit) unexpectedly fail because of inaccuracies
set_target_properties(hip::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
set_target_properties(hip-lang::host PROPERTIES INTERFACE_LINK_LIBRARIES "")
set(CMAKE_HIP_IMPLICIT_LINK_LIBRARIES "")
target_include_directories(bitsandbytes PRIVATE ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/include ${ROCM_PATH}/include /include)
target_link_directories(bitsandbytes PRIVATE ${ROCM_PATH}/lib /lib)
target_link_libraries(bitsandbytes PUBLIC roc::hipblas hip::hiprand roc::hipsparse)
target_compile_definitions(bitsandbytes PUBLIC BNB_USE_HIP)
set_source_files_properties(${HIP_FILES} PROPERTIES LANGUAGE HIP)
set_target_properties(bitsandbytes PROPERTIES LINKER_LANGUAGE CXX)
if(HIP_VERSION VERSION_LESS "6.1")
target_compile_definitions(bitsandbytes PUBLIC NO_HIPBLASLT)
else()
find_package(hipblaslt)
target_link_libraries(bitsandbytes PUBLIC roc::hipblaslt)
endif()
endif()
if(BUILD_MPS)
add_dependencies(bitsandbytes metallib)
target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
@@ -8,7 +8,7 @@ import torch
from bitsandbytes.functional import CUBLAS_Context, _cuda_device_of, _get_tensor_stream, get_ptr
from ..._ops import register_kernel
from ...cextension import lib
from ...cextension import HIP_ENVIRONMENT, lib
@register_kernel("bitsandbytes::int8_linear_matmul", "cuda")
@@ -210,7 +210,12 @@ def _get_col_absmax(
@register_kernel("bitsandbytes::quantize_blockwise", "cuda")
def _(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
torch._check_is_size(blocksize)
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
else:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
torch._check(code.dtype == torch.float32, lambda: f"code must be float32, got {code.dtype}")
n = A.numel()
@@ -264,7 +269,11 @@ def _(
def _dequantize_blockwise_impl(
A: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int, dtype: torch.dtype, out: torch.Tensor
) -> None:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
else:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
torch._check(A.dtype == torch.uint8, lambda: f"A must be uint8, got {A.dtype}")
torch._check(
dtype in [torch.float16, torch.bfloat16, torch.float32],
@@ -294,7 +303,11 @@ def _dequantize_blockwise_impl(
def _(
A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
) -> tuple[torch.Tensor, torch.Tensor]:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
else:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
torch._check(quant_type in ["fp4", "nf4"])
torch._check(
A.dtype in [torch.bfloat16, torch.float16, torch.float32],
@@ -372,7 +385,11 @@ def _dequantize_4bit_impl(
dtype: torch.dtype,
out: torch.Tensor,
) -> None:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
if HIP_ENVIRONMENT:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128])
else:
torch._check(blocksize in [4096, 2048, 1024, 512, 256, 128, 64])
torch._check(quant_type in ["fp4", "nf4"])
torch._check(
dtype in [torch.bfloat16, torch.float16, torch.float32],
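Not part of this diff: a minimal sketch of what the HIP-aware blocksize guards above mean for callers, assuming a ROCm build of PyTorch (which still exposes the device as "cuda"):

import torch
import bitsandbytes.functional as F

A = torch.randn(8, 1024, device="cuda", dtype=torch.float16)

# On ROCm, blocksize=64 now fails torch._check; 128 is the smallest block
# accepted by the HIP kernels. The same call is also valid on CUDA.
packed, state = F.quantize_4bit(A, blocksize=128, quant_type="nf4")
restored = F.dequantize_4bit(packed, state)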
@@ -9,7 +9,7 @@ from typing import Optional
import torch
from bitsandbytes.consts import DYNAMIC_LIBRARY_SUFFIX, PACKAGE_DIR
from bitsandbytes.cuda_specs import CUDASpecs, get_cuda_specs, get_cuda_version_tuple
from bitsandbytes.cuda_specs import CUDASpecs, get_cuda_specs, get_cuda_version_tuple, get_rocm_gpu_arch
logger = logging.getLogger(__name__)
@@ -28,6 +28,11 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path:
override_value = os.environ.get("BNB_CUDA_VERSION")
if override_value:
library_name = re.sub(r"cuda\d+", f"cuda{override_value}", library_name, count=1)
if torch.version.hip:
raise RuntimeError(
f"BNB_CUDA_VERSION={override_value} detected for ROCm!! \n"
f"Clear the variable and retry: export BNB_CUDA_VERSION=\n"
)
logger.warning(
f"WARNING: BNB_CUDA_VERSION={override_value} environment variable detected; loading {library_name}.\n"
"This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.\n"
@@ -75,10 +80,11 @@ class CudaBNBNativeLibrary(BNBNativeLibrary):
def get_available_cuda_binary_versions() -> list[str]:
"""Get formatted CUDA versions from existing library files using cuda_specs logic"""
lib_pattern = f"libbitsandbytes_cuda*{DYNAMIC_LIBRARY_SUFFIX}"
lib_pattern = f"libbitsandbytes_{BNB_BACKEND.lower()}*{DYNAMIC_LIBRARY_SUFFIX}"
versions = []
for lib in Path(__file__).parent.glob(lib_pattern):
match = re.search(r"cuda(\d{3})", lib.name)
pattern = rf"{BNB_BACKEND.lower()}(\d+)"
match = re.search(pattern, lib.name)
if match:
ver_code = int(match.group(1))
major = ver_code // 10
@@ -89,8 +95,8 @@ def get_available_cuda_binary_versions() -> list[str]:
def parse_cuda_version(version_str: str) -> str:
"""Convert raw version string (e.g. '118' from env var) to formatted version (e.g. '11.8')"""
if version_str.isdigit() and len(version_str) == 3:
return f"{version_str[:2]}.{version_str[2]}"
if version_str.isdigit():
return f"{version_str[:-1]}.{version_str[-1]}"
return version_str # fallback as safety net
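The relaxed parsing matters because ROCm version codes embedded in library names are typically two digits (e.g. libbitsandbytes_rocm62.so), while CUDA codes are three. A quick illustration, not part of this diff:

from bitsandbytes.cextension import parse_cuda_version

assert parse_cuda_version("118") == "11.8"   # three-digit CUDA code
assert parse_cuda_version("62") == "6.2"     # two-digit ROCm code
assert parse_cuda_version("12.4") == "12.4"  # already formatted, passed through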
@@ -151,7 +157,7 @@ class ErrorHandlerMockBNBNativeLibrary(BNBNativeLibrary):
"""Format detailed error message for library loading failures"""
analysis = ""
no_cpu_lib_found = "libbitsandbytes_cpu.so: cannot open" in original_error
no_cuda_lib_found = "CUDA binary not found" in original_error
no_cuda_lib_found = f"{BNB_BACKEND} binary not found" in original_error
if no_cpu_lib_found:
analysis = "\n🚨 Failed to load CPU-only bitsandbytes library 🚨\n\n"
@@ -160,9 +166,9 @@ class ErrorHandlerMockBNBNativeLibrary(BNBNativeLibrary):
version_list_str = "\n - " + "\n - ".join(available_versions) if available_versions else "NONE"
analysis = (
(
f"\n🚨 CUDA VERSION MISMATCH 🚨\n"
f"Requested CUDA version: {requested_version}\n"
f"Detected PyTorch CUDA version: {user_cuda_version}\n"
f"\n🚨 {BNB_BACKEND} VERSION MISMATCH 🚨\n"
f"Requested {BNB_BACKEND} version: {requested_version}\n"
f"Detected PyTorch {BNB_BACKEND} version: {user_cuda_version}\n"
f"Available pre-compiled versions: {version_list_str}\n\n"
"This means:\n"
"The version you're trying to use is NOT distributed with this package\n\n"
@@ -177,42 +183,47 @@ class ErrorHandlerMockBNBNativeLibrary(BNBNativeLibrary):
troubleshooting = (
(
"This typically happens when:\n"
"1. bitsandbytes doesn't ship with a pre-compiled binary for your CUDA version\n"
"2. The library wasn't compiled properly during installation from source\n\n"
f"This typically happens when:\n"
f"1. bitsandbytes doesn't ship with a pre-compiled binary for your {BNB_BACKEND} version\n"
f"2. The library wasn't compiled properly during installation from source\n\n"
)
if no_cuda_lib_found
else "This typically happens when you checked the code out from source and your torch installation doesn't detect CUDA on your machine.\n\n"
else f"This typically happens when you checked the code out from source and your torch installation doesn't detect {BNB_BACKEND} on your machine.\n\n"
)
note = (
(
"To make bitsandbytes work, the compiled library version MUST exactly match the linked CUDA version.\n"
"If your CUDA version doesn't have a pre-compiled binary, you MUST compile from source.\n\n"
f"To make bitsandbytes work, the compiled library version MUST exactly match the linked {BNB_BACKEND} version.\n"
f"If your {BNB_BACKEND} version doesn't have a pre-compiled binary, you MUST compile from source.\n\n"
)
if no_cuda_lib_found
else ""
)
compile_instructions = (
(
("COMPILE FROM SOURCE for CPU-only:\n `cmake -DCOMPUTE_BACKEND=cpu -S . && make`\n\n")
if not no_cuda_lib_found
else (
"You have two options:\n"
"1. COMPILE FROM SOURCE (required if no binary exists):\n"
" https://huggingface.co/docs/bitsandbytes/main/en/installation#cuda-compile\n"
"2. Use BNB_CUDA_VERSION to specify a DIFFERENT CUDA version from the detected one, which is installed on your machine and matching an available pre-compiled version listed above\n\n"
)
if no_cuda_lib_found
else "COMPILE FROM SOURCE for CPU-only:\n `cmake -DCOMPUTE_BACKEND=cpu -S . && make`\n\n"
if not HIP_ENVIRONMENT
else (
"You can COMPILE FROM SOURCE as mentioned here:\n"
" https://huggingface.co/docs/bitsandbytes/main/en/installation?backend=AMD+ROCm#amd-gpu\n"
)
)
diagnostics = (
"🔍 Run this command for detailed diagnostics:\n"
"python -m bitsandbytes\n\n"
"If you've tried everything and still have issues:\n"
"1. Include ALL version info (operating system, bitsandbytes, pytorch, cuda, python)\n"
"2. Describe what you've tried in detail\n"
"3. Open an issue with this information:\n"
" https://github.com/bitsandbytes-foundation/bitsandbytes/issues\n\n"
f"🔍 Run this command for detailed diagnostics:\n"
f"python -m bitsandbytes\n\n"
f"If you've tried everything and still have issues:\n"
f"1. Include ALL version info (operating system, bitsandbytes, pytorch, {BNB_BACKEND.lower()}, python)\n"
f"2. Describe what you've tried in detail\n"
f"3. Open an issue with this information:\n"
f" https://github.com/bitsandbytes-foundation/bitsandbytes/issues\n\n"
)
return f"{analysis}{base_msg}{troubleshooting}{note}{compile_instructions}{original_error}\n{diagnostics}"
@@ -227,18 +238,19 @@ class ErrorHandlerMockBNBNativeLibrary(BNBNativeLibrary):
)
return (
f"\n🚨 CUDA SETUP ERROR: Missing dependency: {missing_lib} 🚨\n\n"
f"CUDA {cuda_major_version}.x runtime libraries were not found in the LD_LIBRARY_PATH.\n\n"
f"\n🚨 {BNB_BACKEND} SETUP ERROR: Missing dependency: {missing_lib} 🚨\n\n"
f"{BNB_BACKEND} {cuda_major_version}.x runtime libraries were not found in the LD_LIBRARY_PATH.\n\n"
f"To fix this, make sure that:\n"
f"1. You have installed CUDA {cuda_major_version}.x toolkit on your system\n"
f"2. The CUDA runtime libraries are in your LD_LIBRARY_PATH\n\n"
f"1. You have installed {BNB_BACKEND} {cuda_major_version}.x toolkit on your system\n"
f"2. The {BNB_BACKEND} runtime libraries are in your LD_LIBRARY_PATH\n\n"
f"You can add them with (and persist the change by adding the line to your .bashrc):\n"
f" export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/cuda-{cuda_major_version}.x/lib64\n\n"
f" export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/{BNB_BACKEND.lower()}-{cuda_major_version}.x/\
{'lib64' if not HIP_ENVIRONMENT else 'lib'}\n\n"
f"Original error: {self.error_msg}\n\n"
f"🔍 Run this command for detailed diagnostics:\n"
f"python -m bitsandbytes\n\n"
f"If you've tried everything and still have issues:\n"
f"1. Include ALL version info (operating system, bitsandbytes, pytorch, cuda, python)\n"
f"1. Include ALL version info (operating system, bitsandbytes, pytorch, {BNB_BACKEND.lower()}, python)\n"
f"2. Describe what you've tried in detail\n"
f"3. Open an issue with this information:\n"
f" https://github.com/bitsandbytes-foundation/bitsandbytes/issues\n\n"
@@ -267,7 +279,7 @@ def get_native_library() -> BNBNativeLibrary:
cuda_binary_path = get_cuda_bnb_library_path(cuda_specs)
if not cuda_binary_path.exists():
raise RuntimeError(f"Configured CUDA binary not found at {cuda_binary_path}")
raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
binary_path = cuda_binary_path
@@ -286,6 +298,8 @@ def get_native_library() -> BNBNativeLibrary:
return BNBNativeLibrary(dll)
ROCM_GPU_ARCH = get_rocm_gpu_arch()
try:
# to support Intel CPU/GPU (XPU) backend
import intel_extension_for_pytorch as ipex
@@ -296,8 +310,12 @@ except BaseException:
ipex_cpu = None
ipex_xpu = None
try:
if torch.version.hip:
HIP_ENVIRONMENT, BNB_BACKEND = True, "ROCm"
else:
HIP_ENVIRONMENT, BNB_BACKEND = False, "CUDA"
lib = get_native_library()
except Exception as e:
error_msg = str(e)
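Downstream modules key off these two globals instead of probing torch.version directly. A small usage sketch, not part of this diff:

from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT

# "ROCm" when PyTorch was built against HIP, "CUDA" otherwise.
print(f"bitsandbytes backend: {BNB_BACKEND}")

# The backend-dependent default blocksize used throughout the 4-bit API.
default_blocksize = 128 if HIP_ENVIRONMENT else 64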
import dataclasses
from functools import lru_cache
import logging
import re
import subprocess
from typing import Optional
import torch
@@ -73,3 +76,27 @@ def get_cuda_specs() -> Optional[CUDASpecs]:
)
except Exception:
return None
def get_rocm_gpu_arch() -> str:
"""Get ROCm GPU architecture."""
logger = logging.getLogger(__name__)
try:
if torch.version.hip:
result = subprocess.run(["rocminfo"], capture_output=True, text=True)
match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout)
if match:
return "gfx" + match.group(1)
else:
return "unknown"
else:
return "unknown"
except Exception as e:
logger.error(f"Could not detect ROCm GPU architecture: {e}")
if torch.cuda.is_available():
logger.warning(
"""
ROCm GPU architecture detection failed despite ROCm being available.
""",
)
return "unknown"
@@ -5,7 +5,7 @@ from pathlib import Path
import torch
from bitsandbytes.cextension import get_cuda_bnb_library_path
from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
from bitsandbytes.cuda_specs import CUDASpecs
from bitsandbytes.diagnostics.utils import print_dedented
@@ -32,9 +32,13 @@ CUDART_PATH_IGNORED_ENVVARS = {
}
CUDA_RUNTIME_LIB_PATTERNS = (
"cudart64*.dll", # Windows
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
"nvcuda*.dll", # Windows
("libamdhip64.so*",)
if HIP_ENVIRONMENT
else (
"cudart64*.dll", # Windows
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
"nvcuda*.dll", # Windows
)
)
logger = logging.getLogger(__name__)
@@ -56,7 +60,7 @@ def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path]:
pass
for lib_pattern in CUDA_RUNTIME_LIB_PATTERNS:
for pth in dir.glob(lib_pattern):
if pth.is_file():
if pth.is_file() and not pth.is_symlink():
yield pth
except (OSError, PermissionError):
pass
@@ -103,7 +107,7 @@ def find_cudart_libraries() -> Iterator[Path]:
yield from find_cuda_libraries_in_path_list(value)
def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
def _print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
print(
f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, "
f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.",
@@ -128,7 +132,37 @@ def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
)
def print_cuda_runtime_diagnostics() -> None:
def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None:
print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")
binary_path = get_cuda_bnb_library_path(cuda_specs)
if not binary_path.exists():
print_dedented(
f"""
Library not found: {binary_path}.
Maybe you need to compile it from source? If you compiled from source, check that ROCm version
in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version
and rebuild bitsandbytes.
""",
)
hip_major, hip_minor = cuda_specs.cuda_version_tuple
if (hip_major, hip_minor) < (6, 1):
print_dedented(
"""
WARNING: bitsandbytes is fully supported only on ROCm 6.1 and newer.
""",
)
def print_diagnostics(cuda_specs: CUDASpecs) -> None:
if HIP_ENVIRONMENT:
_print_hip_diagnostics(cuda_specs)
else:
_print_cuda_diagnostics(cuda_specs)
def _print_cuda_runtime_diagnostics() -> None:
cudart_paths = list(find_cudart_libraries())
if not cudart_paths:
print("CUDA SETUP: WARNING! CUDA runtime files not found in any environmental path.")
@@ -153,3 +187,33 @@ def print_cuda_runtime_diagnostics() -> None:
)
for pth in cudart_paths:
print(f"* Found CUDA runtime at: {pth}")
def _print_hip_runtime_diagnostics() -> None:
cudart_paths = list(find_cudart_libraries())
if not cudart_paths:
print("WARNING! ROCm runtime files not found in any environmental path.")
elif len(cudart_paths) > 1:
print_dedented(
f"""
Found duplicate ROCm runtime files (see below).
We select the PyTorch default ROCm runtime, which is {torch.version.hip},
but this might mismatch with the ROCm version that is needed for bitsandbytes.
To resolve it, install PyTorch built for the ROCm version you want to use
and set LD_LIBRARY_PATH to your ROCm install path, e.g.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm-6.1.2/lib
""",
)
for pth in cudart_paths:
print(f"* Found ROCm runtime at: {pth}")
def print_runtime_diagnostics() -> None:
if HIP_ENVIRONMENT:
_print_hip_runtime_diagnostics()
else:
_print_cuda_runtime_diagnostics()
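Both dispatchers keep python -m bitsandbytes backend-agnostic. A sketch of driving them directly, not part of this diff:

from bitsandbytes.cuda_specs import get_cuda_specs
from bitsandbytes.diagnostics.cuda import print_diagnostics, print_runtime_diagnostics

specs = get_cuda_specs()  # populated for both CUDA and ROCm builds of PyTorch
if specs:
    print_diagnostics(specs)  # dispatches to the HIP or CUDA variant
print_runtime_diagnostics()   # likewise for runtime library discovery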
@@ -6,10 +6,11 @@ import traceback
import torch
from bitsandbytes import __version__ as bnb_version
from bitsandbytes.cextension import BNB_BACKEND
from bitsandbytes.consts import PACKAGE_GITHUB_URL
from bitsandbytes.cuda_specs import get_cuda_specs
from bitsandbytes.diagnostics.cuda import (
print_cuda_diagnostics,
print_diagnostics,
)
from bitsandbytes.diagnostics.utils import print_dedented, print_header
@@ -77,19 +78,19 @@ def main():
cuda_specs = get_cuda_specs()
if cuda_specs:
print_cuda_diagnostics(cuda_specs)
print_diagnostics(cuda_specs)
# TODO: There's a lot of noise in this; needs improvement.
# print_runtime_diagnostics()
if not torch.cuda.is_available():
print("PyTorch says CUDA is not available. Possible reasons:")
print("1. CUDA driver not installed")
print(f"PyTorch says {BNB_BACKEND} is not available. Possible reasons:")
print(f"1. {BNB_BACKEND} driver not installed")
print("2. Using a CPU-only PyTorch build")
print("3. No GPU detected")
else:
print("Checking that the library is importable and CUDA is callable...")
print(f"Checking that the library is importable and {BNB_BACKEND} is callable...")
try:
sanity_check()
@@ -15,7 +15,7 @@ from typing_extensions import deprecated
from bitsandbytes.utils import _reverse_4bit_compress_format, pack_dict_to_tensor, unpack_tensor_to_dict
from .cextension import ipex_cpu, ipex_xpu, lib
from .cextension import HIP_ENVIRONMENT, ipex_cpu, ipex_xpu, lib
name2qmap = {}
@@ -868,10 +868,12 @@ def quantize_fp4(
A: torch.Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
blocksize=None,
compress_statistics=False,
quant_storage=torch.uint8,
):
if blocksize is None:
blocksize = 64 if not HIP_ENVIRONMENT else 128
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "fp4", quant_storage)
@@ -879,10 +881,12 @@ def quantize_nf4(
A: torch.Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
blocksize=None,
compress_statistics=False,
quant_storage=torch.uint8,
):
if blocksize is None:
blocksize = 64 if not HIP_ENVIRONMENT else 128
return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "nf4", quant_storage)
@@ -890,7 +894,7 @@ def quantize_4bit(
A: torch.Tensor,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize=64,
blocksize=None,
compress_statistics=False,
quant_type="fp4",
quant_storage=torch.uint8,
@@ -904,7 +908,7 @@
absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
out (`torch.Tensor`, *optional*): A tensor to use to store the result.
blocksize (`int`, *optional*):
The size of the blocks. Defaults to 64.
The size of the blocks. Defaults to 128 on ROCm and 64 otherwise.
Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
compress_statistics (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.
@@ -918,6 +922,10 @@
- `torch.Tensor`: The quantized tensor with packed 4-bit values.
- [`QuantState`]: The state object used to undo the quantization.
"""
if blocksize is None:
blocksize = 64 if not HIP_ENVIRONMENT else 128
input_shape = A.shape
_out, _absmax = torch.ops.bitsandbytes.quantize_4bit.default(
@@ -968,8 +976,10 @@ def dequantize_fp4(
quant_state: Optional[QuantState] = None,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize: int = 64,
blocksize: Optional[int] = None,
) -> torch.Tensor:
if blocksize is None:
blocksize = 64 if not HIP_ENVIRONMENT else 128
return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4")
@@ -978,8 +988,10 @@ def dequantize_nf4(
quant_state: Optional[QuantState] = None,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize: int = 64,
blocksize: Optional[int] = None,
) -> torch.Tensor:
if blocksize is None:
blocksize = 64 if not HIP_ENVIRONMENT else 128
return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4")
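With the defaults resolved at call time, callers can stay backend-agnostic. A minimal sketch, not part of this diff:

import torch
from bitsandbytes.functional import dequantize_nf4, quantize_nf4

weights = torch.randn(256, 256, device="cuda", dtype=torch.float16)

# blocksize=None picks 128 on ROCm and 64 on CUDA; an explicit value is
# still honored, subject to the backend's valid blocksizes.
packed, state = quantize_nf4(weights)
restored = dequantize_nf4(packed, state)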
@@ -988,7 +1000,7 @@ def dequantize_4bit(
quant_state: Optional[QuantState] = None,
absmax: Optional[torch.Tensor] = None,
out: Optional[torch.Tensor] = None,
blocksize: int = 64,
blocksize: Optional[int] = None,
quant_type="fp4",
) -> torch.Tensor:
"""Dequantizes a packed 4-bit quantized tensor.
@@ -1007,7 +1019,7 @@
Required if `quant_state` is not provided and ignored otherwise.
out (`torch.Tensor`, *optional*): A tensor to use to store the result.
blocksize (`int`, *optional*):
The size of the blocks. Defaults to 64.
The size of the blocks. Defaults to 128 on ROCm and 64 otherwise.
Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.
@@ -1017,6 +1029,10 @@
Returns:
`torch.Tensor`: The dequantized tensor.
"""
if blocksize is None:
blocksize = 64 if not HIP_ENVIRONMENT else 128
if quant_state is None:
assert absmax is not None and out is not None
@@ -11,6 +11,7 @@ from torch import Tensor, device, dtype, nn
import torch.nn.functional as F
import bitsandbytes as bnb
from bitsandbytes.cextension import HIP_ENVIRONMENT
from bitsandbytes.functional import QuantState, _enable_ipex_fusion, ipex_cpu, ipex_xpu
from bitsandbytes.optim import GlobalOptimManager
from bitsandbytes.utils import (
@@ -213,7 +214,7 @@ class Params4bit(torch.nn.Parameter):
data: Optional[torch.Tensor] = None,
requires_grad=False, # quantized weights should be frozen by default
quant_state: Optional[QuantState] = None,
blocksize: int = 64,
blocksize: Optional[int] = None,
compress_statistics: bool = True,
quant_type: str = "fp4",
quant_storage: torch.dtype = torch.uint8,
@@ -223,6 +224,9 @@
if data is None:
data = torch.empty(0)
if blocksize is None:
blocksize = 64 if not HIP_ENVIRONMENT else 128
self = torch.Tensor._make_subclass(cls, data, requires_grad)
self.blocksize = blocksize
self.compress_statistics = compress_statistics
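The same deferral reaches the module level through Params4bit. A usage sketch, not part of this diff:

import torch
import bitsandbytes as bnb

# Leaving blocksize unset lets Params4bit pick the backend default
# (128 on ROCm, 64 on CUDA).
layer = bnb.nn.Linear4bit(1024, 1024, quant_type="nf4", compute_dtype=torch.float16)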
#pragma once
#define BNB_WARP_SIZE warpSize
// These are set based on current BNB support for CDNA 2 & RDNA 3. Update as needed for future architectures.
#define BNB_MAX_THREADS_PER_SM 2048
#define BNB_BF16_AVAILABLE true
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include <float.h>
#include <ops_hip.cuh>
#ifndef kernels
#define kernels
__global__ void kQuantize(float* code, float* __restrict__ const A, unsigned char* out, const int n);
__global__ void kDequantize(float* code, unsigned char* A, float* out, const int n);
template <typename T, int BLOCK_SIZE, int NUM_PER_TH, int STOCHASTIC, int DATA_TYPE>
__global__ void kQuantizeBlockwise(
float* code, T* __restrict__ const A, float* absmax, unsigned char* out, float* __restrict__ const rand,
const int rand_offset, const int n
);
template <typename T, int BLOCK_SIZE, int THREADS, int NUM_PER_TH, int DATA_TYPE>
__global__ void
kDequantizeBlockwise(float* code, unsigned char* A, float* absmax, T* out, const int blocksize, const int n);
template <typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit2State(
T* g, T* p, float* state1, float* state2, float* unorm, const float beta1, const float beta2, const float eps,
const float weight_decay, const int step, const float lr, const float gnorm_scale, const int n
);
template <typename T, int OPTIMIZER>
__global__ void kOptimizer32bit2State(
T* g, T* p, float* state1, float* state2, float* unorm, const float max_unorm, const float param_norm,
const float beta1, const float beta2, const float beta3, const float alpha, const float eps,
const float weight_decay, const int step, const float lr, const float gnorm_scale, const bool skip_zeros,
const int n
);
template <typename T, int OPTIMIZER, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPreconditionOptimizer32bit1State(
T* g, T* p, float* state1, float* unorm, const float beta1, const float beta2, const float eps,
const float weight_decay, const int step, const float lr, const float gnorm_scale, const int n
);
template <typename T, int OPTIMIZER>
__global__ void kOptimizer32bit1State(
T* g, T* p, float* state1, float* unorm, const float max_unorm, const float param_norm, const float beta1,
const float beta2, const float eps, const float weight_decay, const int step, const float lr,
const float gnorm_scale, const bool skip_zeros, const int n
);
template <typename T, int OPTIMIZER>
__global__ void kPreconditionOptimizerStatic8bit1State(
T* p, T* __restrict__ const g, unsigned char* __restrict__ const state1, float* unorm, const float beta1,
const float beta2, const float eps, const int step, float* __restrict__ const quantiles1, float* max1,
float* new_max1, const float weight_decay, const float gnorm_scale, const int n
);
template <typename T, int OPTIMIZER>
__global__ void kOptimizerStatic8bit1State(
T* p, T* const g, unsigned char* state1, const float* unorm, const float max_unorm, const float param_norm,
const float beta1, const float beta2, const float eps, const int step, const float lr,
float* __restrict__ const quantiles1, float* max1, float* new_max1, float weight_decay, const float gnorm_scale,
const int n
);
template <typename T, int OPTIMIZER>
__global__ void kPreconditionOptimizerStatic8bit2State(
T* p, T* __restrict__ const g, unsigned char* __restrict__ const state1, unsigned char* __restrict__ const state2,
float* unorm, const float beta1, const float beta2, const float eps, const int step,
float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* max1, float* max2,
float* new_max1, float* new_max2, const float gnorm_scale, const int n
);
template <typename T, int OPTIMIZER>
__global__ void kOptimizerStatic8bit2State(
T* p, T* const g, unsigned char* state1, unsigned char* state2, const float* unorm, const float max_unorm,
const float param_norm, const float beta1, const float beta2, const float eps, const int step, const float lr,
float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* max1, float* max2,
float* new_max1, float* new_max2, float weight_decay, const float gnorm_scale, const int n
);
template <typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH>
__global__ void kOptimizerStatic8bit2StateBlockwise(
T* p, T* __restrict__ const g, unsigned char* state1, unsigned char* state2, const float beta1, const float beta2,
const float beta3, const float alpha, const float eps, const int step, const float lr,
float* __restrict__ const quantiles1, float* __restrict__ const quantiles2, float* absmax1, float* absmax2,
float weight_decay, const float gnorm_scale, const bool skip_zeros, const int n
);
template <typename T, int OPTIMIZER, int BLOCK_SIZE, int N_PER_TH>
__global__ void kOptimizerStatic8bit1StateBlockwise(
T* p, T* __restrict__ const g, unsigned char* state1, const float beta1, const float beta2, const float eps,
const int step, const float lr, float* __restrict__ const quantiles1, float* absmax1, float weight_decay,
const float gnorm_scale, const bool skip_zeros, const int n
);
template <typename T, int BLOCK_SIZE, int NUM_VALS>
__global__ void kPercentileClipping(T* __restrict__ g, float* gnorm_vec, int step, const int n);
template <typename T, int SPMM_ITEMS, int BITS>
__global__ void kspmm_coo_very_sparse_naive(
int* max_count, int* max_idx, int* offset_rowidx, int* rowidx, int* colidx, half* values, T* B, half* out,
float* __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB
);
template <int ITEMS_PER_THREAD, int THREADS>
__global__ void kdequant_mm_int32_fp16(
int* __restrict__ const A, float* __restrict__ const rowStats, float* __restrict__ const colStats, half* out,
half* __restrict__ const bias, const int numRows, const int numCols, const int n
);
template <typename T, int THREADS, int SPARSE_DECOMP>
__global__ void kgetRowStats(T* __restrict__ A, float* rowStats, float threshold, int rows, int cols);
template <typename T, int THREADS, int SPARSE_DECOMP>
__global__ void kInt8VectorQuant(T* __restrict__ A, int8_t* out, float* rowStats, float threshold, int rows, int cols);
template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT>
__global__ void kTransformRowToFormat(
char* __restrict__ const A, char* out, int rows, int cols, int tiledCols, int outRows, int outCols
);
template <typename T, int BITS, int THREADS>
__global__ void gemm_device(int M, int N, int K, T* __restrict__ const A, T* B, T* out, int lda, int ldb, int ldc);
template <typename T, int THREADS>
__global__ void kgemm_4bit_inference(
int M, int N, int K, T* __restrict__ const A, unsigned char* B, float* absmax, T* out, int lda, int ldb, int ldc,
int blocksize
);
template <typename T, int THREADS, int BITS>
__global__ void kgemm_4bit_inference_naive(
int M, int N, int K, T* __restrict__ const A, unsigned char* B, float* absmax, const float* datatype, T* out,
int lda, int ldb, int ldc, int blocksize
);
template <typename T, int FUNC> __global__ void kfunc(T* A, T* B, T value, long n);
#endif
@@ -6,11 +6,29 @@
#if BUILD_CUDA
#include <ops.cuh>
#endif
#if BUILD_HIP
#include <ops_hip.cuh>
#endif
#if BUILD_MPS
// #include <mps_ops.h>
#endif
#include <cpu_ops.h>
// Compatibility between HIP/CUDA APIs
#if BUILD_HIP
#define cudaStream_t hipStream_t
#define __nv_bfloat16 hip_bfloat16
#define cublasLtHandle_t hipblasLtHandle_t
#define ContextCusparse ContextHipsparse
#define cusparseHandle_t hipsparseHandle_t
#define cudaMallocManaged hipMallocManaged
#define cudaMemAttachHost hipMemAttachHost
#define cudaPeekAtLastError hipPeekAtLastError
#define cudaDeviceGetAttribute hipDeviceGetAttribute
#define cudaDevAttrConcurrentManagedAccess hipDeviceAttributeConcurrentManagedAccess
#define cudaMemPrefetchAsync hipMemPrefetchAsync
#endif
// We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
// We use macro functions to expand all the different optimizers. Looks ugly, and is ugly, but its better than to
// maintain all that boilerplate
@@ -18,7 +36,7 @@
// UNMANGLED CALLS
//===================================================================================
#if BUILD_CUDA
#if BUILD_CUDA || BUILD_HIP
// void gemm_host_fp32(int M, int N, int K, float * A, float* B, float * out, int lda, int ldb, int ldc)
//{ gemm_host<float>(M, N, K, A, B, out, lda, ldb, ldc, 32); }
@@ -291,7 +309,7 @@ void spmm_coo_very_sparse_naive_int8(
#endif
extern "C" {
#if BUILD_CUDA
#if BUILD_CUDA || BUILD_HIP
void cquantize(float* code, float* A, unsigned char* out, int n) { quantize(code, A, out, n); }
void cdequantize(float* code, unsigned char* A, float* out, int n, cudaStream_t stream) {
@@ -7,6 +7,8 @@ from typing import Any
import torch
from bitsandbytes.cextension import HIP_ENVIRONMENT
test_dims_rng = random.Random(42)
@@ -21,7 +23,7 @@ def get_available_devices():
# If the environment variable is set, use it directly.
return [os.environ["BNB_TEST_DEVICE"]]
devices = ["cpu"]
devices = [] if HIP_ENVIRONMENT else ["cpu"]
if hasattr(torch, "accelerator"):
# PyTorch 2.6+ - determine accelerator using agnostic API.
import pytest
from bitsandbytes.cextension import get_cuda_bnb_library_path
from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
from bitsandbytes.cuda_specs import CUDASpecs
@@ -13,11 +13,13 @@ def cuda120_spec() -> CUDASpecs:
)
@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm")
def test_get_cuda_bnb_library_path(monkeypatch, cuda120_spec):
monkeypatch.delenv("BNB_CUDA_VERSION", raising=False)
assert get_cuda_bnb_library_path(cuda120_spec).stem == "libbitsandbytes_cuda120"
@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm")
def test_get_cuda_bnb_library_path_override(monkeypatch, cuda120_spec, caplog):
monkeypatch.setenv("BNB_CUDA_VERSION", "110")
assert get_cuda_bnb_library_path(cuda120_spec).stem == "libbitsandbytes_cuda110"