Commit 336c899a authored by Christina Floristean

Minor test and README changes

parent cdd97841
......@@ -39,7 +39,7 @@ kernels support in-place attention during inference and training. They use
implementations, respectively.
- **Efficient alignment scripts** using the original AlphaFold HHblits/JackHMMER pipeline or [ColabFold](https://github.com/sokrypton/ColabFold)'s, which uses the faster MMseqs2 instead. We've used them to generate millions of alignments.
- **FlashAttention** support greatly speeds up MSA attention.
- **DeepSpeed DS4Sci_EvoformerAttention kernel** is a memory-efficient attention kernel developed as part of a new collaboration between OpenFold and DeepSpeed4Science initiative. The kernel provides substantial speedups for training and inference, and significantly reduces the model's peak device memory requirement by 13X. The model is 15% faster during the initial training and finetuning stages, with an overall of 40% lower peak memory consumption. To use this feature, simply set the `use_deepspeed_evo_attention` option in `openfold/config.py`.
- **DeepSpeed DS4Sci_EvoformerAttention kernel** is a memory-efficient attention kernel developed as part of a collaboration between OpenFold and the DeepSpeed4Science initiative. The kernel provides substantial speedups for training and inference, and significantly reduces the model's peak device memory requirement by 13X. The model is 15% faster during the initial training and finetuning stages, and up to 4x faster during inference. To use this feature, simply set the `use_deepspeed_evo_attention` option in `openfold/config.py`.
## Installation (Linux)
......
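The README bullet above says the kernel is enabled via the `use_deepspeed_evo_attention` option in `openfold/config.py`. As a rough illustration only (the exact config path and preset name are assumptions to verify against your OpenFold version), toggling it from Python might look like this:

```python
# Illustrative sketch: assumes the flag lives under the config's "globals"
# section of openfold/config.py and that "model_1" is a valid preset name.
from openfold.config import model_config

config = model_config("model_1")
config.globals.use_deepspeed_evo_attention = True  # route MSA/pair attention through the DS4Sci kernel
```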
......@@ -103,7 +103,7 @@ def random_attention_inputs(batch_size, n_seq, n, no_heads, c_hidden, inf=1e9,
q = torch.rand(batch_size, n_seq, n, c_hidden, dtype=dtype, requires_grad=requires_grad).cuda()
kv = torch.rand(batch_size, n_seq, n, c_hidden, dtype=dtype, requires_grad=requires_grad).cuda()
mask = torch.randint(0, 2, (batch_size, n_seq, 1, 1, n), dtype=dtype, requires_grad=requires_grad).cuda()
mask = torch.randint(0, 2, (batch_size, n_seq, 1, 1, n), dtype=dtype, requires_grad=False).cuda()
z_bias = torch.rand(batch_size, 1, no_heads, n, n, dtype=dtype, requires_grad=requires_grad).cuda()
mask_bias = inf * (mask - 1)
if requires_grad:
......
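For reference, the `mask_bias = inf * (mask - 1)` line in the test helper turns a 0/1 attention mask into an additive bias: positions to keep (1) map to 0 and masked positions (0) map to `-inf`, so they receive essentially zero weight after the softmax. A minimal self-contained sketch of the same idea:

```python
import torch

inf = 1e9
mask = torch.tensor([1., 1., 0., 1.])   # 1 = keep, 0 = mask out
mask_bias = inf * (mask - 1)            # -> [0, 0, -1e9, 0]

scores = torch.zeros(4)                 # toy attention logits
weights = torch.softmax(scores + mask_bias, dim=-1)
# weights ~= [1/3, 1/3, 0, 1/3]: the masked position is suppressed
```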
......@@ -56,6 +56,8 @@ class TestDeepSpeedKernel(unittest.TestCase):
c_hidden, c_hidden, c_hidden, c_hidden, no_heads
).cuda()
# Change output params init for testing since they are initialized with 'final' init (zeros)
# Otherwise both will just return zero.
with torch.no_grad():
lecun_normal_init_(a.linear_g.weight)
lecun_normal_init_(a.linear_o.weight)
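The 'final' (zero) init mentioned in the comment above zeroes the output projection, so both attention variants would trivially return zeros and the comparison would be meaningless; re-initializing with `lecun_normal_init_` makes the outputs non-trivial. A toy illustration of why zero-initialized projections collapse the output (plain PyTorch, not the OpenFold module):

```python
import torch

proj = torch.nn.Linear(8, 8, bias=False)
torch.nn.init.zeros_(proj.weight)   # analogous to the 'final' zero init
x = torch.randn(2, 8)
print(proj(x).abs().max())          # tensor(0.) -- every output is zero
```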
......@@ -106,12 +108,14 @@ class TestDeepSpeedKernel(unittest.TestCase):
lecun_normal_init_(attn.linear_o.weight)
def clone(t):
# Create new params, clone values
t = t.clone()
if t.requires_grad:
t.retain_grad()
return t
def init_attn():
# Create new attention object with same initial weights
a_clone = Attention(
c_hidden, c_hidden, c_hidden, c_hidden, no_heads
).cuda()
......@@ -119,12 +123,13 @@ class TestDeepSpeedKernel(unittest.TestCase):
a_clone.load_state_dict(attn.state_dict())
return a_clone
# Clone param values and run attention with DS kernel
q_repro = clone(q)
kv_repro = clone(kv)
biases_repro = [clone(b) for b in biases]
a = init_attn()
out_repro = a(q_repro, kv_repro, biases=biases_repro, use_deepspeed_evo_attention=True)
a_repro = init_attn()
out_repro = a_repro(q_repro, kv_repro, biases=biases_repro, use_deepspeed_evo_attention=True)
loss_repro = torch.mean(out_repro)
loss_repro.backward()
......@@ -132,19 +137,30 @@ class TestDeepSpeedKernel(unittest.TestCase):
kv_gt = clone(kv)
biases_gt = [clone(b) for b in biases]
a = init_attn()
out_gt = a(q_gt, kv_gt, biases=biases_gt)
# Clone param values and run attention without DS kernel
a_gt = init_attn()
out_gt = a_gt(q_gt, kv_gt, biases=biases_gt)
loss_gt = torch.mean(out_gt)
loss_gt.backward()
pairs = zip([q_repro, kv_repro, biases_repro[0], biases_repro[1]],
[q_gt, kv_gt, biases_gt[0], biases_gt[1]])
# Compare the grads of attention inputs
pairs = zip([q_repro, kv_repro, biases_repro[1]],
[q_gt, kv_gt, biases_gt[1]])
for i, item in enumerate(pairs):
t_repro, t_gt = item
err = torch.max(torch.abs(t_repro.grad.cpu() - t_gt.grad.cpu()))
self.assertTrue(err < eps, f'Error item #{i}: {err}')
# Compare the grads of model weights
a_repro_params = dict(a_repro.named_parameters())
a_gt_params = dict(a_gt.named_parameters())
for name in a_gt_params.keys():
t_repro = a_repro_params[name]
t_gt = a_gt_params[name]
err = torch.max(torch.abs(t_repro.grad.cpu() - t_gt.grad.cpu()))
self.assertTrue(err < eps, f'Error item {name}: {err}')
def compare_evoformer(self, dtype, eps):
"""
Compare Evoformer output with and without using DeepSpeed Evoformer attention kernel.
......
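The pattern used in the test above is generic: run two identically initialized modules, one per code path, backprop a scalar loss through each, then compare the gradients of both the inputs and the named parameters against a tolerance. A minimal framework-only sketch of that recipe, with a plain linear layer standing in for OpenFold's `Attention`:

```python
import copy
import torch

def compare_grads(module_a, module_b, x, eps=1e-6):
    """Backprop a mean loss through both modules and compare gradients."""
    xa = x.clone().detach().requires_grad_(True)
    xb = x.clone().detach().requires_grad_(True)

    module_a(xa).mean().backward()
    module_b(xb).mean().backward()

    # Gradients w.r.t. the inputs
    assert torch.max(torch.abs(xa.grad - xb.grad)) < eps

    # Gradients w.r.t. the model weights, matched by parameter name
    params_b = dict(module_b.named_parameters())
    for name, p_a in module_a.named_parameters():
        err = torch.max(torch.abs(p_a.grad - params_b[name].grad))
        assert err < eps, f"grad mismatch for {name}: {err}"

m = torch.nn.Linear(16, 16)
compare_grads(m, copy.deepcopy(m), torch.randn(4, 16))
```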