Unverified commit cdcae8d3, authored by Matthew Douglas, committed by GitHub

CI runner updates (#1643)

* Test g5g runner

* Switch L4 to L40S runner; swap GitHub Linux T4 runner for AWS g4dn

* Run tests on last 2 pytorch stable releases

* Run tests on last 2 pytorch stable releases
parent 513e69be
@@ -49,7 +49,7 @@ jobs:
   build-cuda:
     strategy:
       matrix:
-        cuda_version: ["11.8.0", "12.8.1"]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
         include:
           - os: ubuntu-22.04
@@ -100,7 +100,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
-        torch_version: ["2.7.0"]
+        torch_version: ["2.6.0", "2.7.0"]
         include:
           - os: ubuntu-22.04
             arch: x86_64
@@ -138,9 +138,35 @@ jobs:
       - name: Show installed packages
         run: pip list
+      - name: Show environment information
+        run: python -m torch.utils.collect_env
       - name: Run tests
         run: pytest --durations=100

+  # cuda-aarch64-tests:
+  #   if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
+  #   needs: build-cuda
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       os: [ubuntu-22.04-arm]
+  #       arch: [aarch64]
+  #       torch_version: ["2.7.0"]
+  #       cuda_version: ["11.8.0", "12.8.1"]
+  #   runs-on: bandb-aws-g5g-4xlarge-plus-use1-public-80
+  #   env:
+  #     BNB_TEST_DEVICE: cuda
+  #   steps:
+  #     - name: Show GPU Information
+  #       run: nvidia-smi
+  #     - name: Show pip packages
+  #       run: pip list
   cuda-tests:
     if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     needs: build-cuda
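The commented-out cuda-aarch64-tests job above stages a future Arm64 GPU leg on the AWS g5g runner (Graviton2 hosts with NVIDIA T4G GPUs). Like the active jobs, it exports BNB_TEST_DEVICE. A minimal sketch of how a test suite can honor that variable (the variable name comes from this workflow; the selection logic below is an assumption, not the repository's actual conftest):

import os

import torch

# Assumption: the suite derives its target device from BNB_TEST_DEVICE,
# defaulting to CPU when the variable is unset.
device = os.environ.get("BNB_TEST_DEVICE", "cpu")
if device == "cuda":
    assert torch.cuda.is_available(), "BNB_TEST_DEVICE=cuda but no GPU is visible"

# Smoke check that tensors land on the requested device.
x = torch.ones(4, device=device)
print(x.device)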
@@ -149,25 +175,28 @@ jobs:
       matrix:
         os: [ubuntu-22.04, windows-2025]
         arch: [x86_64]
-        gpu: [T4, L4]
-        cuda_version: ["11.8.0", "12.8.1"]
+        gpu: [T4, L40S]
+        cuda_version: ["11.8.0", "12.6.3", "12.8.1"]
         include:
           - cuda_version: "11.8.0"
             torch_version: "2.4.1"
             pypi_index: "https://download.pytorch.org/whl/cu118"
+          - cuda_version: "12.6.3"
+            torch_version: "2.6.0"
+            pypi_index: "https://download.pytorch.org/whl/cu126"
           - cuda_version: "12.8.1"
             torch_version: "2.7.0"
             pypi_index: "https://download.pytorch.org/whl/cu128"
-          # L4 runners
+          # L40S runners
           - os: ubuntu-22.04
-            gpu: L4
-            runner: bandb-aws-g6-4xlarge-plus-use1-public-80
+            gpu: L40S
+            runner: bandb-aws-g6e-4xlarge-plus-use1-public-80
           # T4 runners
           - os: ubuntu-22.04
             gpu: T4
-            runner: CUDA-Linux-x64
+            runner: bandb-aws-g4dn-4xlarge-plus-use1-public-80
           - os: windows-2025
             gpu: T4
             runner: CUDA-Windows-x64
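Each include entry above pairs a CUDA toolkit with the matching PyTorch wheel index, so the torch build under test links against the same CUDA minor series. A hedged sketch of the install such an entry typically drives (the actual install step sits outside the hunks shown here, so the exact command is an assumption):

import subprocess
import sys

# Values as they would arrive from the matrix (matrix.torch_version and
# matrix.pypi_index in the workflow above).
torch_version = "2.6.0"
pypi_index = "https://download.pytorch.org/whl/cu126"

# Equivalent to:
#   pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu126
subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    f"torch=={torch_version}",
    "--index-url", pypi_index,
])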
@@ -176,10 +205,12 @@ jobs:
           # and cannot support CUDA 12+. Skip for now.
           - os: windows-2025
             cuda_version: "12.8.1"
+          - os: windows-2025
+            cuda_version: "12.6.3"
-          # No Windows L4 runners.
+          # No Windows L40S runners.
           - os: windows-2025
-            gpu: L4
+            gpu: L40S
     runs-on: ${{ matrix.runner }}
     env:
       BNB_TEST_DEVICE: cuda
@@ -210,5 +241,8 @@ jobs:
       - name: Show installed packages
         run: pip list
+      - name: Show environment information
+        run: python -m torch.utils.collect_env
       - name: Run tests
         run: pytest --durations=100
@@ -929,39 +929,6 @@ class TestSpMMFunctional:
         #     torch.cuda.synchronize()
         # print(time.time() - t0)

-    @pytest.mark.parametrize("dim1", [256, 1024], ids=id_formatter("dim1"))
-    @pytest.mark.parametrize("dim2", [256, 1024], ids=id_formatter("dim2"))
-    @pytest.mark.skip("No longer supported")
-    def test_integrated_sparse_decomp(self, dim1, dim2):
-        threshold = 3.0
-        for _ in range(k):
-            A = torch.randn(dim1, dim2).cuda().half()
-            w1 = torch.randn(dim1, dim2).cuda().half()
-            out1 = torch.matmul(A, w1.t())
-
-            Cw1, statsw1, _ = F.int8_vectorwise_quant(w1)
-            CA, statsA, _ = F.int8_vectorwise_quant(A)
-
-            out1_32 = F.int8_linear_matmul(CA, Cw1)
-            out2 = F.int8_mm_dequant(out1_32, statsA, statsw1)
-
-            # CA, statsA, outlier_cols = F.int8_vectorwise_quant(A, threshold=threshold)
-            CA, _, statsA, _, coo_tensor = F.double_quant(A, threshold=threshold)
-            out1_32 = F.int8_linear_matmul(CA, Cw1)
-            out3 = F.int8_mm_dequant(out1_32, statsA, statsw1)
-
-            assert coo_tensor is not None
-            out4 = F.spmm_coo(coo_tensor, w1.t())
-            # idx = torch.unique(coo_tensor._indices()[1]).long()
-            # out4 = torch.matmul(A, w1.t())
-            out5 = out3 + out4
-
-            err1 = torch.abs(out1 - out2).mean().item()
-            err2 = torch.abs(out1 - out5).mean().item()
-            assert err2 < err1
-
     @pytest.mark.parametrize("dim1", [1 * 2048])
     @pytest.mark.parametrize("dim2", [2048])
     @pytest.mark.parametrize("dtype", [torch.int8])
...
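The deleted test exercised the int8 sparse-decomposition path through the retired F.double_quant API. For reference, a minimal sketch of the plain int8 matmul path that remains, using only the functional calls already visible in the deleted code (shapes are illustrative; exact signatures may vary between releases):

import torch

import bitsandbytes.functional as F

# Row-wise int8 quantization of activations and weights, as in the
# deleted test above.
A = torch.randn(256, 256, device="cuda", dtype=torch.float16)
w = torch.randn(256, 256, device="cuda", dtype=torch.float16)

CA, stats_a, _ = F.int8_vectorwise_quant(A)
Cw, stats_w, _ = F.int8_vectorwise_quant(w)

# int8 GEMM with int32 accumulation, then dequantize with both scale vectors.
out_i32 = F.int8_linear_matmul(CA, Cw)
out = F.int8_mm_dequant(out_i32, stats_a, stats_w)

# Compare against the fp16 reference, as the deleted test did.
ref = torch.matmul(A, w.t())
print((out - ref).abs().mean().item())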
@@ -130,7 +130,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     assert l1.weight.dtype == torch.int8

     l1.eval()
-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = l1(b1)
         assert o1.dtype == torch.float16
@@ -139,7 +139,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     assert mlp.fc1.weight.dtype == torch.int8
     assert mlp.fc2.weight.dtype == torch.int8

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
@@ -152,7 +152,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     assert mlp.fc1.weight.dtype == torch.int8
     assert mlp.fc2.weight.dtype == torch.int8

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
@@ -163,7 +163,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     mlp = MLP8bit(32, 64, threshold=threshold, has_fp16_weights=False).half().to(device)

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
@@ -185,7 +185,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
         .to(device)
     )

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
@@ -207,7 +207,7 @@ def test_linear8bitlt_no_fp16_weights(device, threshold):
     w1, w2 = mlp.fc1.weight.clone().to(device), mlp.fc2.weight.clone().to(device)  # grab weights before quantization,
     mlp = mlp.to(device).half()  # and this line triggers quantization

-    for i in range(100):
+    for i in range(4):
         b1 = torch.randn(16, 8, 32, device=device, dtype=torch.float16)
         o1 = mlp(b1)
         assert o1.dtype == torch.float16
...
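The loop bodies are unchanged; only the iteration count drops from 100 to 4, which still covers the first-call quantization path plus a few steady-state forwards. A sketch of the pattern these loops exercise, using bitsandbytes.nn.Linear8bitLt directly (MLP8bit is a helper local to the test file; the threshold value here is illustrative):

import torch

import bitsandbytes as bnb

# Inference-only int8 layer with fp16 inputs/outputs, mirroring the test.
l1 = bnb.nn.Linear8bitLt(32, 64, threshold=6.0, has_fp16_weights=False).half().cuda()
l1.eval()

for _ in range(4):  # 4 iterations now suffice instead of 100
    b1 = torch.randn(16, 8, 32, device="cuda", dtype=torch.float16)
    o1 = l1(b1)
    assert o1.dtype == torch.float16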