Unverified Commit 888788d7 authored by pnunna93, committed by GitHub
Browse files

Enable ROCm backend with custom ops integration (#1683)



* Port ROCm changes from multi-backend-refactor branch

* Update ops.py

* Update functional.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update functional.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update functional.py

* Update functional.py

* Update functional.py

* Update functional.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update ops.py

* Update functional.py

* Update functional.py

* Update functional.py

* Update test_ops.py

* Update test_functional.py

* Update test_ops.py

* Update test_functional.py

* Update test_functional.py

* Update functional.py

* Update functional.py

* Update ops.py

* Update ops.py

* Update test_functional.py

* Update test_functional.py

* Update cextension.py

* Update cuda_specs.py

* Update cuda_specs.py

* Update test_functional.py

* Update test_linear4bit.py

* Update test_cuda_setup_evaluator.py

* Update test_functional.py

* Update modules.py

* Update modules.py

* Update ops.py

* Update test_linear4bit.py

* Update ops.py

* Update ops.py

* Update test_linear4bit.py

* Update test_linear4bit.py

* Update python-package.yml

* Update python-package.yml

* Update python-package.yml

* Update python-package.yml

* Create build-rocm.sh

* Update cuda_specs.py

* Fix trailing whitespace

* Remove conflicts.diff

* update for hipblasVersionMajor >=3

* Update test_functional.py

* Update test_linear4bit.py

* Update test_ops.py

* Update main.py

* Update test_functional.py

* Update test_linear4bit.py

* Update test_ops.py

* Update test_linear4bit.py

* Lint

* Lint

* Update helpers.py

* Update test_functional.py

* Update test_linear4bit.py

* Update test_ops.py

* Lint

* Update pythonInterface.cpp

* lint fix

* lint

* Update pythonInterface.cpp

* revert permissions change

* Fix indentation

* Update kernels_hip.cuh

* Update kernels.hip

* Update ops.hip

* Update ops_hip.cuh

* Update kernels_hip.cuh

* Update kernels.hip

* Update kernels.hip

* Update ops.hip

* Update ops_hip.cuh

* Update ops.hip

* Update CMakeLists.txt

* Update functional.py

* Update cextension.py

* Update cextension.py

---------
Co-authored-by: MISHANMAURYA <118961433+MISHANMAURYA@users.noreply.github.com>
Co-authored-by: MISHANMAUYRA <mishanmaurya31081@gmail.com>
Co-authored-by: amcamd <andrew.chapman@amd.com>
Co-authored-by: Prasanth Nunna <root@banff-cyxtera-s78-1.amd.com>
parent a1cd3f6e
@@ -4,6 +4,7 @@ import pytest
 import torch

 import bitsandbytes
+from bitsandbytes.cextension import HIP_ENVIRONMENT
 from bitsandbytes.functional import ipex_xpu
 from tests.helpers import TRUE_FALSE, get_available_devices, id_formatter, is_supported_on_hpu
@@ -102,7 +103,7 @@ class TestLLMInt8Ops:
 class TestInt8BlockwiseQuantOps:
     @pytest.mark.parametrize("device", get_available_devices())
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
-    @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
+    @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512])
     def test_quantize_blockwise(self, device, dtype, blocksize):
         if device == "cpu":
             if dtype != torch.float32:
@@ -126,7 +127,7 @@ class TestInt8BlockwiseQuantOps:
     @pytest.mark.parametrize("device", get_available_devices())
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
-    @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
+    @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512])
     def test_dequantize_blockwise(self, device, dtype, blocksize):
         if device == "cpu" and dtype != torch.float32:
             pytest.skip("CPU implementation is only available for float32")
@@ -156,7 +157,7 @@ class Test4bitBlockwiseQuantOps:
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
     @pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype"))
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
-    @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
+    @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512])
     def test_quantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
         if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
             pytest.skip("This configuration is not supported on HPU.")
@@ -180,7 +181,7 @@ class Test4bitBlockwiseQuantOps:
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
     @pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype"))
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
-    @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
+    @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512])
     def test_dequantize_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
         if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
             pytest.skip("This configuration is not supported on HPU.")
@@ -214,7 +215,7 @@ class Test4bitBlockwiseQuantOps:
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
     @pytest.mark.parametrize("storage_dtype", [torch.uint8, torch.bfloat16], ids=id_formatter("storage_dtype"))
     @pytest.mark.parametrize("quant_type", ["fp4", "nf4"])
-    @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
+    @pytest.mark.parametrize("blocksize", [64, 128, 256, 512] if not HIP_ENVIRONMENT else [128, 256, 512])
     def test_gemv_4bit(self, device, dtype, storage_dtype, quant_type, blocksize):
         if device == "hpu" and not is_supported_on_hpu(quant_type, dtype, storage_dtype):
             pytest.skip("This configuration is not supported on HPU.")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment