Unverified Commit 3028b20a authored by Ali Hassani's avatar Ali Hassani Committed by GitHub
Browse files

Fix natten (#22229)

* Add kernel size to NATTEN's QK arguments.

The new NATTEN 0.14.5 supports PyTorch 2.0, but also adds an additional
argument to the QK operation to allow optional RPBs.

This ends up failing NATTEN tests.

This commit adds NATTEN back to circleci and adds the arguments to get
it working again.

* Force NATTEN >= 0.14.5
parent 074490b2
@@ -374,8 +374,7 @@ exotic_models_job = CircleCIJob(
         "pip install 'git+https://github.com/facebookresearch/detectron2.git'",
         "sudo apt install tesseract-ocr",
         "pip install pytesseract",
-        # wait until natten is ready for torch 2.0.0
-        # "pip install natten",
+        "pip install natten",
     ],
     tests_to_run=[
         "tests/models/*layoutlmv*",
...
@@ -129,7 +129,7 @@ _deps = [
     "keras-nlp>=0.3.1",
     "librosa",
     "nltk",
-    "natten>=0.14.4",
+    "natten>=0.14.5",
     "numpy>=1.17",
     "onnxconverter-common",
     "onnxruntime-tools>=1.4.2",
...
@@ -35,7 +35,7 @@ deps = {
     "keras-nlp": "keras-nlp>=0.3.1",
     "librosa": "librosa",
     "nltk": "nltk",
-    "natten": "natten>=0.14.4",
+    "natten": "natten>=0.14.5",
     "numpy": "numpy>=1.17",
     "onnxconverter-common": "onnxconverter-common",
     "onnxruntime-tools": "onnxruntime-tools>=1.4.2",
...
@@ -347,7 +347,7 @@ class NeighborhoodAttention(nn.Module):
         query_layer = query_layer / math.sqrt(self.attention_head_size)

         # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
-        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.dilation)
+        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, self.dilation)

         # Normalize the attention scores to probabilities.
         attention_probs = nn.functional.softmax(attention_scores, dim=-1)
...
@@ -339,7 +339,7 @@ class NeighborhoodAttention(nn.Module):
         query_layer = query_layer / math.sqrt(self.attention_head_size)

         # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
-        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, 1)
+        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, 1)

         # Normalize the attention scores to probabilities.
         attention_probs = nn.functional.softmax(attention_scores, dim=-1)
...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment