Commit c90b421a authored by Mauro Bisson

Added Thorsten's fix for a bug regarding contiguous storage, taken from:

https://github.com/NVIDIA/torch-harmonics/compare/main...azrael417:torch-harmonics:tkurth/mauro-rebase
parent 7aa95ce5
@@ -81,7 +81,7 @@ class TestNeighborhoodAttentionS2(unittest.TestCase):
         ],
         skip_on_empty=True,
     )
-    def test_custom_implementation(self, batch_size, channels, heads, in_shape, out_shape, grid_in, grid_out, atol, rtol, verbose=True):
+    def test_custom_implementation(self, batch_size, channels, heads, in_shape, out_shape, grid_in, grid_out, atol, rtol, verbose=False):
         """Tests numerical equivalence between the custom (CUDA) implementation and the reference torch implementation"""

         nlat_in, nlon_in = in_shape
@@ -161,7 +161,7 @@ class TestNeighborhoodAttentionS2(unittest.TestCase):
         ],
         skip_on_empty=True,
     )
-    def test_neighborhood_global_equivalence(self, batch_size, channels, heads, in_shape, out_shape, grid_in, grid_out, atol, rtol, verbose=True):
+    def test_neighborhood_global_equivalence(self, batch_size, channels, heads, in_shape, out_shape, grid_in, grid_out, atol, rtol, verbose=False):
         """Tests numerical equivalence between the global spherical attention module and the neighborhood spherical attention module with the neighborhood set ot the whole sphere"""

         nlat_in, nlon_in = in_shape
@@ -223,7 +223,7 @@ class TestNeighborhoodAttentionS2(unittest.TestCase):
         skip_on_empty=True,
     )
     @unittest.skipUnless((torch.cuda.is_available() and _cuda_extension_available), "skipping performance test because CUDA is not available")
-    def test_perf(self, batch_size, channels, heads, in_shape, out_shape, grid_in, grid_out, atol, rtol, verbose=True):
+    def test_perf(self, batch_size, channels, heads, in_shape, out_shape, grid_in, grid_out, atol, rtol, verbose=False):

         # extract some parameters
         nlat_in, nlon_in = in_shape
...
@@ -479,9 +479,10 @@ class _NeighborhoodAttentionS2Cuda(torch.autograd.Function):
         qw = qw.reshape(B*nh, -1, H, W)

         # convert to float32
-        kw = kw.to(torch.float32)
-        vw = vw.to(torch.float32)
-        qw = qw.to(torch.float32)
+        inp_dtype = kw.dtype
+        kw = kw.to(torch.float32).contiguous()
+        vw = vw.to(torch.float32).contiguous()
+        qw = qw.to(torch.float32).contiguous()

         output = attention_cuda_extension.forward(kw, vw, qw, quad_weights,
                                                   col_idx, row_off,
@@ -490,6 +491,9 @@ class _NeighborhoodAttentionS2Cuda(torch.autograd.Function):
         _, C, H, W = output.shape
         output = output.reshape(B, -1, H, W)

+        # convert back precision
+        output = output.to(dtype=inp_dtype)
+
         return output

     @staticmethod
...
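The forward-path change above follows a common pattern when wrapping a custom CUDA kernel: record the incoming dtype, hand the kernel contiguous float32 tensors, and cast the result back to the original precision at the end. Below is a minimal, self-contained sketch of that pattern; `run_attention_kernel` and its callable argument are hypothetical stand-ins for the real `attention_cuda_extension.forward` call, not part of torch-harmonics.

    import torch

    def run_attention_kernel(kw, vw, qw, kernel_fn):
        # remember the precision the caller passed in (e.g. float16/bfloat16 under AMP)
        inp_dtype = kw.dtype

        # custom CUDA kernels typically expect float32, densely packed storage;
        # .contiguous() only copies when the tensor is not already contiguous
        kw = kw.to(torch.float32).contiguous()
        vw = vw.to(torch.float32).contiguous()
        qw = qw.to(torch.float32).contiguous()

        out = kernel_fn(kw, vw, qw)

        # convert back to the caller's precision
        return out.to(dtype=inp_dtype)

    # usage with a stand-in for the extension call
    k = v = q = torch.randn(2, 4, 8, 16, dtype=torch.bfloat16)
    out = run_attention_kernel(k, v, q, lambda k_, v_, q_: k_ + v_ + q_)
    assert out.dtype == torch.bfloat16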
@@ -291,7 +291,7 @@ class NeighborhoodAttentionS2(nn.Module):
         # set the last value
         row_offset[row + 1] = idz + 1

-        row_offset = torch.from_numpy(row_offset)
+        row_offset = torch.from_numpy(row_offset).contiguous()
         self.max_psi_nnz = col_idx.max().item() + 1

         self.register_buffer("psi_row_idx", row_idx, persistent=False)
...
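The same storage concern motivates the `.contiguous()` on the precomputed row offsets: index buffers registered on the module are later handed to the CUDA extension and should be densely packed. A minimal sketch of that idea, using hypothetical names (`SparseNeighborhood`, `row_off`) rather than the actual torch-harmonics module:

    import numpy as np
    import torch
    import torch.nn as nn

    class SparseNeighborhood(nn.Module):
        """Holds CSR-style row offsets for a custom kernel (illustrative only)."""

        def __init__(self, row_offset_np: np.ndarray):
            super().__init__()
            # torch.from_numpy shares memory with the NumPy array; .contiguous()
            # ensures dense row-major storage even if the array was a strided view
            row_offset = torch.from_numpy(row_offset_np).contiguous()
            # non-persistent buffer: moves with .to(device) but is not saved in state_dict
            self.register_buffer("row_off", row_offset, persistent=False)

    # usage
    mod = SparseNeighborhood(np.arange(10, dtype=np.int64))
    print(mod.row_off.is_contiguous())  # True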