Commit b14e47f4 authored by zhuwenwen

Merge branch 'main' of https://github.com/hpcaitech/FastFold

parents 490cb6f5 05681304
from functools import partial
import torch
import torch.nn as nn
from typing import Tuple, Dict
from fastfold.utils import all_atom_multimer
from fastfold.utils.feats import dgram_from_positions
from fastfold.model.nn.primitives import Linear, LayerNorm
from fastfold.model.nn.template import (
TemplatePairStack,
TemplatePointwiseAttention,
)
from fastfold.utils import geometry
from fastfold.utils.tensor_utils import one_hot, tensor_tree_map, dict_multimap
class InputEmbedderMultimer(nn.Module):
"""
Embeds a subset of the input features.
Implements Algorithms 3 (InputEmbedder) and 4 (relpos).
"""
def __init__(
self,
tf_dim: int,
msa_dim: int,
c_z: int,
c_m: int,
max_relative_idx: int,
use_chain_relative: bool,
max_relative_chain: int,
**kwargs,
):
"""
Args:
tf_dim:
Final dimension of the target features
msa_dim:
Final dimension of the MSA features
c_z:
Pair embedding dimension
c_m:
MSA embedding dimension
max_relative_idx:
Window size used in relative positional encoding
use_chain_relative:
Whether to add chain-relative features (same-entity flag and
relative chain index) for multimer inputs
max_relative_chain:
Maximum relative chain offset used when use_chain_relative is
enabled
"""
super(InputEmbedderMultimer, self).__init__()
self.tf_dim = tf_dim
self.msa_dim = msa_dim
self.c_z = c_z
self.c_m = c_m
self.linear_tf_z_i = Linear(tf_dim, c_z)
self.linear_tf_z_j = Linear(tf_dim, c_z)
self.linear_tf_m = Linear(tf_dim, c_m)
self.linear_msa_m = Linear(msa_dim, c_m)
# RPE stuff
self.max_relative_idx = max_relative_idx
self.use_chain_relative = use_chain_relative
self.max_relative_chain = max_relative_chain
if self.use_chain_relative:
self.no_bins = 2 * max_relative_idx + 2 + 1 + 2 * max_relative_chain + 2
else:
self.no_bins = 2 * max_relative_idx + 1
self.linear_relpos = Linear(self.no_bins, c_z)
def relpos(self, batch: Dict[str, torch.Tensor]):
pos = batch["residue_index"]
asym_id = batch["asym_id"]
asym_id_same = asym_id[..., None] == asym_id[..., None, :]
offset = pos[..., None] - pos[..., None, :]
clipped_offset = torch.clamp(
offset + self.max_relative_idx, 0, 2 * self.max_relative_idx
)
rel_feats = []
if self.use_chain_relative:
final_offset = torch.where(
asym_id_same,
clipped_offset,
(2 * self.max_relative_idx + 1) * torch.ones_like(clipped_offset),
)
rel_pos = torch.nn.functional.one_hot(
final_offset,
2 * self.max_relative_idx + 2,
)
rel_feats.append(rel_pos)
entity_id = batch["entity_id"]
entity_id_same = entity_id[..., None] == entity_id[..., None, :]
rel_feats.append(entity_id_same[..., None])
sym_id = batch["sym_id"]
rel_sym_id = sym_id[..., None] - sym_id[..., None, :]
max_rel_chain = self.max_relative_chain
clipped_rel_chain = torch.clamp(
rel_sym_id + max_rel_chain,
0,
2 * max_rel_chain,
)
final_rel_chain = torch.where(
entity_id_same,
clipped_rel_chain,
(2 * max_rel_chain + 1) * torch.ones_like(clipped_rel_chain),
)
rel_chain = torch.nn.functional.one_hot(
final_rel_chain.long(),
2 * max_rel_chain + 2,
)
rel_feats.append(rel_chain)
else:
rel_pos = torch.nn.functional.one_hot(
clipped_offset,
2 * self.max_relative_idx + 1,
)
rel_feats.append(rel_pos)
rel_feat = torch.cat(rel_feats, dim=-1).to(self.linear_relpos.weight.dtype)
return self.linear_relpos(rel_feat)
def forward(
self, batch: Dict[str, torch.Tensor]
) -> Tuple[torch.Tensor, torch.Tensor]:
tf = batch["target_feat"]
msa = batch["msa_feat"]
# [*, N_res, c_z]
tf_emb_i = self.linear_tf_z_i(tf)
tf_emb_j = self.linear_tf_z_j(tf)
# [*, N_res, N_res, c_z]
pair_emb = tf_emb_i[..., None, :] + tf_emb_j[..., None, :, :]
pair_emb = pair_emb + self.relpos(batch)
# [*, N_clust, N_res, c_m]
n_clust = msa.shape[-3]
tf_m = (
self.linear_tf_m(tf)
.unsqueeze(-3)
.expand(((-1,) * len(tf.shape[:-2]) + (n_clust, -1, -1)))
)
msa_emb = self.linear_msa_m(msa) + tf_m
return msa_emb, pair_emb
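# Hedged usage sketch (not part of the original module): the dimensions below
# are arbitrary illustrative values, not a production configuration. With
# max_relative_idx=32, use_chain_relative=True and max_relative_chain=2, the
# relpos feature has 2*32 + 2 offset bins, 1 same-entity bit, and 2*2 + 2
# chain-offset bins, i.e. no_bins == 73.
if __name__ == "__main__":
    embedder = InputEmbedderMultimer(
        tf_dim=22, msa_dim=49, c_z=8, c_m=8,
        max_relative_idx=32, use_chain_relative=True, max_relative_chain=2,
    )
    assert embedder.no_bins == 73
    toy_batch = {
        "residue_index": torch.arange(6),
        "asym_id": torch.tensor([0, 0, 0, 1, 1, 1]),
        "entity_id": torch.zeros(6, dtype=torch.long),
        "sym_id": torch.tensor([0, 0, 0, 1, 1, 1]),
    }
    print(embedder.relpos(toy_batch).shape)  # torch.Size([6, 6, 8])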
class TemplatePairEmbedderMultimer(nn.Module):
def __init__(self,
c_z: int,
c_out: int,
c_dgram: int,
c_aatype: int,
):
super().__init__()
self.dgram_linear = Linear(c_dgram, c_out)
self.aatype_linear_1 = Linear(c_aatype, c_out)
self.aatype_linear_2 = Linear(c_aatype, c_out)
self.query_embedding_layer_norm = LayerNorm(c_z)
self.query_embedding_linear = Linear(c_z, c_out)
self.pseudo_beta_mask_linear = Linear(1, c_out)
self.x_linear = Linear(1, c_out)
self.y_linear = Linear(1, c_out)
self.z_linear = Linear(1, c_out)
self.backbone_mask_linear = Linear(1, c_out)
def forward(self,
template_dgram: torch.Tensor,
aatype_one_hot: torch.Tensor,
query_embedding: torch.Tensor,
pseudo_beta_mask: torch.Tensor,
backbone_mask: torch.Tensor,
multichain_mask_2d: torch.Tensor,
unit_vector: geometry.Vec3Array,
) -> torch.Tensor:
act = 0.
pseudo_beta_mask_2d = (
pseudo_beta_mask[..., None] * pseudo_beta_mask[..., None, :]
)
pseudo_beta_mask_2d *= multichain_mask_2d
template_dgram *= pseudo_beta_mask_2d[..., None]
act += self.dgram_linear(template_dgram)
act += self.pseudo_beta_mask_linear(pseudo_beta_mask_2d[..., None])
aatype_one_hot = aatype_one_hot.to(template_dgram.dtype)
act += self.aatype_linear_1(aatype_one_hot[..., None, :, :])
act += self.aatype_linear_2(aatype_one_hot[..., None, :])
backbone_mask_2d = (
backbone_mask[..., None] * backbone_mask[..., None, :]
)
backbone_mask_2d *= multichain_mask_2d
x, y, z = [coord * backbone_mask_2d for coord in unit_vector]
act += self.x_linear(x[..., None])
act += self.y_linear(y[..., None])
act += self.z_linear(z[..., None])
act += self.backbone_mask_linear(backbone_mask_2d[..., None])
query_embedding = self.query_embedding_layer_norm(query_embedding)
act += self.query_embedding_linear(query_embedding)
return act
class TemplateSingleEmbedderMultimer(nn.Module):
def __init__(self,
c_in: int,
c_m: int,
):
super().__init__()
self.template_single_embedder = Linear(c_in, c_m)
self.template_projector = Linear(c_m, c_m)
def forward(self,
batch,
atom_pos,
aatype_one_hot,
):
out = {}
template_chi_angles, template_chi_mask = (
all_atom_multimer.compute_chi_angles(
atom_pos,
batch["template_all_atom_mask"],
batch["template_aatype"],
)
)
template_features = torch.cat(
[
aatype_one_hot,
torch.sin(template_chi_angles) * template_chi_mask,
torch.cos(template_chi_angles) * template_chi_mask,
template_chi_mask,
],
dim=-1,
)
template_mask = template_chi_mask[..., 0]
template_activations = self.template_single_embedder(
template_features
)
template_activations = torch.nn.functional.relu(
template_activations
)
template_activations = self.template_projector(
template_activations,
)
out["template_single_embedding"] = (
template_activations
)
out["template_mask"] = template_mask
return out
class TemplateEmbedderMultimer(nn.Module):
def __init__(self, config):
super(TemplateEmbedderMultimer, self).__init__()
self.config = config
self.template_pair_embedder = TemplatePairEmbedderMultimer(
**config["template_pair_embedder"],
)
self.template_single_embedder = TemplateSingleEmbedderMultimer(
**config["template_single_embedder"],
)
self.template_pair_stack = TemplatePairStack(
**config["template_pair_stack"],
)
self.linear_t = Linear(config.c_t, config.c_z)
def forward(self,
batch,
z,
padding_mask_2d,
templ_dim,
chunk_size,
multichain_mask_2d,
):
template_embeds = []
n_templ = batch["template_aatype"].shape[templ_dim]
for i in range(n_templ):
idx = batch["template_aatype"].new_tensor(i)
single_template_feats = tensor_tree_map(
lambda t: torch.index_select(t, templ_dim, idx),
batch,
)
single_template_embeds = {}
act = 0.
template_positions, pseudo_beta_mask = (
single_template_feats["template_pseudo_beta"],
single_template_feats["template_pseudo_beta_mask"],
)
template_dgram = dgram_from_positions(
template_positions,
inf=self.config.inf,
**self.config.distogram,
)
aatype_one_hot = torch.nn.functional.one_hot(
single_template_feats["template_aatype"], 22,
)
raw_atom_pos = single_template_feats["template_all_atom_positions"]
atom_pos = geometry.Vec3Array.from_array(raw_atom_pos)
rigid, backbone_mask = all_atom_multimer.make_backbone_affine(
atom_pos,
single_template_feats["template_all_atom_mask"],
single_template_feats["template_aatype"],
)
points = rigid.translation
rigid_vec = rigid[..., None].inverse().apply_to_point(points)
unit_vector = rigid_vec.normalized()
pair_act = self.template_pair_embedder(
template_dgram,
aatype_one_hot,
z,
pseudo_beta_mask,
backbone_mask,
multichain_mask_2d,
unit_vector,
)
single_template_embeds["template_pair_embedding"] = pair_act
single_template_embeds.update(
self.template_single_embedder(
single_template_feats,
atom_pos,
aatype_one_hot,
)
)
template_embeds.append(single_template_embeds)
template_embeds = dict_multimap(
partial(torch.cat, dim=templ_dim),
template_embeds,
)
# [*, S_t, N, N, C_z]
t = self.template_pair_stack(
template_embeds["template_pair_embedding"],
padding_mask_2d.unsqueeze(-3).to(dtype=z.dtype),
chunk_size=chunk_size,
_mask_trans=False,
)
# [*, N, N, C_z]
t = torch.sum(t, dim=-4) / n_templ
t = torch.nn.functional.relu(t)
t = self.linear_t(t)
template_embeds["template_pair_embedding"] = t
return template_embeds
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import torch
import torch.nn as nn
from typing import Tuple, Optional
from functools import partial
from fastfold.model.nn.primitives import Linear, LayerNorm
from fastfold.model.nn.dropout import DropoutRowwise, DropoutColumnwise
from fastfold.model.nn.msa import (
MSARowAttentionWithPairBias,
MSAColumnAttention,
MSAColumnGlobalAttention,
)
from fastfold.model.nn.outer_product_mean import OuterProductMean
from fastfold.model.nn.pair_transition import PairTransition
from fastfold.model.nn.triangular_attention import (
TriangleAttentionStartingNode,
TriangleAttentionEndingNode,
)
from fastfold.model.nn.triangular_multiplicative_update import (
TriangleMultiplicationOutgoing,
TriangleMultiplicationIncoming,
)
from fastfold.utils.checkpointing import checkpoint_blocks, get_checkpoint_fn
from fastfold.utils.tensor_utils import chunk_layer
class MSATransition(nn.Module):
"""
Feed-forward network applied to MSA activations after attention.
Implements Algorithm 9
"""
def __init__(self, c_m, n):
"""
Args:
c_m:
MSA channel dimension
n:
Factor by which c_m is multiplied to obtain the hidden channel
dimension
"""
super(MSATransition, self).__init__()
self.c_m = c_m
self.n = n
self.layer_norm = LayerNorm(self.c_m)
self.linear_1 = Linear(self.c_m, self.n * self.c_m, init="relu")
self.relu = nn.ReLU()
self.linear_2 = Linear(self.n * self.c_m, self.c_m, init="final")
def _transition(self, m, mask):
m = self.linear_1(m)
m = self.relu(m)
m = self.linear_2(m) * mask
return m
@torch.jit.ignore
def _chunk(self,
m: torch.Tensor,
mask: torch.Tensor,
chunk_size: int,
) -> torch.Tensor:
return chunk_layer(
self._transition,
{"m": m, "mask": mask},
chunk_size=chunk_size,
no_batch_dims=len(m.shape[:-2]),
)
def forward(
self,
m: torch.Tensor,
mask: Optional[torch.Tensor] = None,
chunk_size: Optional[int] = None,
) -> torch.Tensor:
"""
Args:
m:
[*, N_seq, N_res, C_m] MSA activation
mask:
[*, N_seq, N_res] MSA mask
Returns:
m:
[*, N_seq, N_res, C_m] MSA activation update
"""
# DISCREPANCY: DeepMind forgets to apply the MSA mask here.
if mask is None:
mask = m.new_ones(m.shape[:-1])
# [*, N_seq, N_res, 1]
mask = mask.unsqueeze(-1)
m = self.layer_norm(m)
if chunk_size is not None:
m = self._chunk(m, mask, chunk_size)
else:
m = self._transition(m, mask)
return m
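# Hedged usage sketch (toy sizes chosen purely for illustration): the
# transition is a position-wise two-layer MLP, so the update keeps the
# [*, N_seq, N_res, C_m] shape and is zeroed at masked positions.
if __name__ == "__main__":
    transition = MSATransition(c_m=16, n=4)
    m = torch.randn(1, 5, 10, 16)    # [*, N_seq, N_res, C_m]
    msa_mask = torch.ones(1, 5, 10)  # [*, N_seq, N_res]
    print(transition(m, mask=msa_mask).shape)  # torch.Size([1, 5, 10, 16])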
class EvoformerBlockCore(nn.Module):
def __init__(
self,
c_m: int,
c_z: int,
c_hidden_opm: int,
c_hidden_mul: int,
c_hidden_pair_att: int,
no_heads_msa: int,
no_heads_pair: int,
transition_n: int,
pair_dropout: float,
inf: float,
eps: float,
_is_extra_msa_stack: bool = False,
is_multimer: bool = False,
):
super(EvoformerBlockCore, self).__init__()
self.is_multimer = is_multimer
self.msa_transition = MSATransition(
c_m=c_m,
n=transition_n,
)
self.outer_product_mean = OuterProductMean(
c_m,
c_z,
c_hidden_opm,
)
self.tri_mul_out = TriangleMultiplicationOutgoing(
c_z,
c_hidden_mul,
)
self.tri_mul_in = TriangleMultiplicationIncoming(
c_z,
c_hidden_mul,
)
self.tri_att_start = TriangleAttentionStartingNode(
c_z,
c_hidden_pair_att,
no_heads_pair,
inf=inf,
)
self.tri_att_end = TriangleAttentionEndingNode(
c_z,
c_hidden_pair_att,
no_heads_pair,
inf=inf,
)
self.pair_transition = PairTransition(
c_z,
transition_n,
)
self.ps_dropout_row_layer = DropoutRowwise(pair_dropout)
self.ps_dropout_col_layer = DropoutColumnwise(pair_dropout)
def forward(
self,
m: torch.Tensor,
z: torch.Tensor,
msa_mask: torch.Tensor,
pair_mask: torch.Tensor,
chunk_size: Optional[int] = None,
_mask_trans: bool = True,
) -> Tuple[torch.Tensor, torch.Tensor]:
# DeepMind doesn't mask these transitions in the source, so _mask_trans
# should be disabled to better approximate the exact activations of
# the original.
msa_trans_mask = msa_mask if _mask_trans else None
pair_trans_mask = pair_mask if _mask_trans else None
m = m + self.msa_transition(
m, mask=msa_trans_mask, chunk_size=chunk_size
)
z = z + self.outer_product_mean(
m, mask=msa_mask, chunk_size=chunk_size
)
z = z + self.ps_dropout_row_layer(self.tri_mul_out(z, mask=pair_mask))
z = z + self.ps_dropout_row_layer(self.tri_mul_in(z, mask=pair_mask))
z = z + self.ps_dropout_row_layer(
self.tri_att_start(z, mask=pair_mask, chunk_size=chunk_size)
)
z = z + self.ps_dropout_col_layer(
self.tri_att_end(z, mask=pair_mask, chunk_size=chunk_size)
)
z = z + self.pair_transition(
z, mask=pair_trans_mask, chunk_size=chunk_size
)
return m, z
class EvoformerBlock(nn.Module):
def __init__(self,
c_m: int,
c_z: int,
c_hidden_msa_att: int,
c_hidden_opm: int,
c_hidden_mul: int,
c_hidden_pair_att: int,
no_heads_msa: int,
no_heads_pair: int,
transition_n: int,
msa_dropout: float,
pair_dropout: float,
inf: float,
eps: float,
is_multimer: bool,
):
super(EvoformerBlock, self).__init__()
self.msa_att_row = MSARowAttentionWithPairBias(
c_m=c_m,
c_z=c_z,
c_hidden=c_hidden_msa_att,
no_heads=no_heads_msa,
inf=inf,
)
self.msa_att_col = MSAColumnAttention(
c_m,
c_hidden_msa_att,
no_heads_msa,
inf=inf,
)
self.msa_dropout_layer = DropoutRowwise(msa_dropout)
self.core = EvoformerBlockCore(
c_m=c_m,
c_z=c_z,
c_hidden_opm=c_hidden_opm,
c_hidden_mul=c_hidden_mul,
c_hidden_pair_att=c_hidden_pair_att,
no_heads_msa=no_heads_msa,
no_heads_pair=no_heads_pair,
transition_n=transition_n,
pair_dropout=pair_dropout,
inf=inf,
eps=eps,
)
self.is_multimer = is_multimer
def forward(self,
m: torch.Tensor,
z: torch.Tensor,
msa_mask: torch.Tensor,
pair_mask: torch.Tensor,
chunk_size: Optional[int] = None,
_mask_trans: bool = True,
) -> Tuple[torch.Tensor, torch.Tensor]:
m = m + self.msa_dropout_layer(
self.msa_att_row(m, z=z, mask=msa_mask, chunk_size=chunk_size)
)
m = m + self.msa_att_col(m, mask=msa_mask, chunk_size=chunk_size)
m, z = self.core(
m,
z,
msa_mask=msa_mask,
pair_mask=pair_mask,
chunk_size=chunk_size,
_mask_trans=_mask_trans,
)
return m, z
class ExtraMSABlock(nn.Module):
"""
Almost identical to the standard EvoformerBlock, except that the
ExtraMSABlock uses GlobalAttention for MSA column attention and
requires more fine-grained control over checkpointing. Separated from
its twin to preserve the TorchScript-ability of the latter.
"""
def __init__(self,
c_m: int,
c_z: int,
c_hidden_msa_att: int,
c_hidden_opm: int,
c_hidden_mul: int,
c_hidden_pair_att: int,
no_heads_msa: int,
no_heads_pair: int,
transition_n: int,
msa_dropout: float,
pair_dropout: float,
inf: float,
eps: float,
ckpt: bool,
is_multimer: bool,
):
super(ExtraMSABlock, self).__init__()
self.ckpt = ckpt
self.msa_att_row = MSARowAttentionWithPairBias(
c_m=c_m,
c_z=c_z,
c_hidden=c_hidden_msa_att,
no_heads=no_heads_msa,
inf=inf,
)
self.msa_att_col = MSAColumnGlobalAttention(
c_in=c_m,
c_hidden=c_hidden_msa_att,
no_heads=no_heads_msa,
inf=inf,
eps=eps,
)
self.msa_dropout_layer = DropoutRowwise(msa_dropout)
self.core = EvoformerBlockCore(
c_m=c_m,
c_z=c_z,
c_hidden_opm=c_hidden_opm,
c_hidden_mul=c_hidden_mul,
c_hidden_pair_att=c_hidden_pair_att,
no_heads_msa=no_heads_msa,
no_heads_pair=no_heads_pair,
transition_n=transition_n,
pair_dropout=pair_dropout,
inf=inf,
eps=eps,
)
self.is_multimer = is_multimer
def forward(self,
m: torch.Tensor,
z: torch.Tensor,
msa_mask: torch.Tensor,
pair_mask: torch.Tensor,
chunk_size: Optional[int] = None,
_chunk_logits: Optional[int] = 1024,
) -> Tuple[torch.Tensor, torch.Tensor]:
m = m + self.msa_dropout_layer(
self.msa_att_row(
m.clone(),
z=z.clone(),
mask=msa_mask,
chunk_size=chunk_size,
_chunk_logits=_chunk_logits if torch.is_grad_enabled() else None,
_checkpoint_chunks=
self.ckpt if torch.is_grad_enabled() else False,
)
)
def fn(m, z):
m = m + self.msa_att_col(m, mask=msa_mask, chunk_size=chunk_size)
m, z = self.core(
m, z, msa_mask=msa_mask, pair_mask=pair_mask, chunk_size=chunk_size
)
return m, z
if(torch.is_grad_enabled() and self.ckpt):
checkpoint_fn = get_checkpoint_fn()
m, z = checkpoint_fn(fn, m, z)
else:
m, z = fn(m, z)
return m, z
class EvoformerStack(nn.Module):
"""
Main Evoformer trunk.
Implements Algorithm 6.
"""
def __init__(
self,
c_m: int,
c_z: int,
c_hidden_msa_att: int,
c_hidden_opm: int,
c_hidden_mul: int,
c_hidden_pair_att: int,
c_s: int,
no_heads_msa: int,
no_heads_pair: int,
no_blocks: int,
transition_n: int,
msa_dropout: float,
pair_dropout: float,
blocks_per_ckpt: int,
inf: float,
eps: float,
clear_cache_between_blocks: bool = False,
is_multimer: bool = False,
**kwargs,
):
"""
Args:
c_m:
MSA channel dimension
c_z:
Pair channel dimension
c_hidden_msa_att:
Hidden dimension in MSA attention
c_hidden_opm:
Hidden dimension in outer product mean module
c_hidden_mul:
Hidden dimension in multiplicative updates
c_hidden_pair_att:
Hidden dimension in triangular attention
c_s:
Channel dimension of the output "single" embedding
no_heads_msa:
Number of heads used for MSA attention
no_heads_pair:
Number of heads used for pair attention
no_blocks:
Number of Evoformer blocks in the stack
transition_n:
Factor by which to multiply c_m to obtain the MSATransition
hidden dimension
msa_dropout:
Dropout rate for MSA activations
pair_dropout:
Dropout used for pair activations
blocks_per_ckpt:
Number of Evoformer blocks in each activation checkpoint
clear_cache_between_blocks:
Whether to clear CUDA's GPU memory cache between blocks of the
stack. Slows down each block but can reduce fragmentation
"""
super(EvoformerStack, self).__init__()
self.blocks_per_ckpt = blocks_per_ckpt
self.clear_cache_between_blocks = clear_cache_between_blocks
self.blocks = nn.ModuleList()
for _ in range(no_blocks):
block = EvoformerBlock(
c_m=c_m,
c_z=c_z,
c_hidden_msa_att=c_hidden_msa_att,
c_hidden_opm=c_hidden_opm,
c_hidden_mul=c_hidden_mul,
c_hidden_pair_att=c_hidden_pair_att,
no_heads_msa=no_heads_msa,
no_heads_pair=no_heads_pair,
transition_n=transition_n,
msa_dropout=msa_dropout,
pair_dropout=pair_dropout,
inf=inf,
eps=eps,
is_multimer=is_multimer,
)
self.blocks.append(block)
self.linear = Linear(c_m, c_s)
def forward(self,
m: torch.Tensor,
z: torch.Tensor,
msa_mask: torch.Tensor,
pair_mask: torch.Tensor,
chunk_size: int,
_mask_trans: bool = True,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
"""
Args:
m:
[*, N_seq, N_res, C_m] MSA embedding
z:
[*, N_res, N_res, C_z] pair embedding
msa_mask:
[*, N_seq, N_res] MSA mask
pair_mask:
[*, N_res, N_res] pair mask
Returns:
m:
[*, N_seq, N_res, C_m] MSA embedding
z:
[*, N_res, N_res, C_z] pair embedding
s:
[*, N_res, C_s] single embedding (or None if extra MSA stack)
"""
blocks = [
partial(
b,
msa_mask=msa_mask,
pair_mask=pair_mask,
chunk_size=chunk_size,
_mask_trans=_mask_trans,
)
for b in self.blocks
]
if(self.clear_cache_between_blocks):
def block_with_cache_clear(block, *args):
torch.cuda.empty_cache()
return block(*args)
blocks = [partial(block_with_cache_clear, b) for b in blocks]
m, z = checkpoint_blocks(
blocks,
args=(m, z),
blocks_per_ckpt=self.blocks_per_ckpt if self.training else None,
)
s = self.linear(m[..., 0, :, :])
return m, z, s
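# Hedged usage sketch: all sizes below are small illustrative values, not the
# AlphaFold configuration. The stack returns the updated MSA and pair
# representations plus the single representation taken from the first MSA row.
if __name__ == "__main__":
    stack = EvoformerStack(
        c_m=16, c_z=8, c_hidden_msa_att=8, c_hidden_opm=8, c_hidden_mul=8,
        c_hidden_pair_att=8, c_s=12, no_heads_msa=2, no_heads_pair=2,
        no_blocks=1, transition_n=2, msa_dropout=0.15, pair_dropout=0.25,
        blocks_per_ckpt=1, inf=1e9, eps=1e-8,
    ).eval()
    m = torch.randn(1, 4, 10, 16)   # [*, N_seq, N_res, C_m]
    z = torch.randn(1, 10, 10, 8)   # [*, N_res, N_res, C_z]
    msa_mask = torch.ones(1, 4, 10)
    pair_mask = torch.ones(1, 10, 10)
    with torch.no_grad():
        m, z, s = stack(m, z, msa_mask=msa_mask, pair_mask=pair_mask, chunk_size=None)
    print(m.shape, z.shape, s.shape)  # [1, 4, 10, 16] [1, 10, 10, 8] [1, 10, 12]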
class ExtraMSAStack(nn.Module):
"""
Implements Algorithm 18.
"""
def __init__(self,
c_m: int,
c_z: int,
c_hidden_msa_att: int,
c_hidden_opm: int,
c_hidden_mul: int,
c_hidden_pair_att: int,
no_heads_msa: int,
no_heads_pair: int,
no_blocks: int,
transition_n: int,
msa_dropout: float,
pair_dropout: float,
inf: float,
eps: float,
ckpt: bool,
clear_cache_between_blocks: bool = False,
is_multimer: bool = False,
**kwargs,
):
super(ExtraMSAStack, self).__init__()
self.clear_cache_between_blocks = clear_cache_between_blocks
self.blocks = nn.ModuleList()
for _ in range(no_blocks):
block = ExtraMSABlock(
c_m=c_m,
c_z=c_z,
c_hidden_msa_att=c_hidden_msa_att,
c_hidden_opm=c_hidden_opm,
c_hidden_mul=c_hidden_mul,
c_hidden_pair_att=c_hidden_pair_att,
no_heads_msa=no_heads_msa,
no_heads_pair=no_heads_pair,
transition_n=transition_n,
msa_dropout=msa_dropout,
pair_dropout=pair_dropout,
inf=inf,
eps=eps,
ckpt=ckpt,
is_multimer=is_multimer,
)
self.blocks.append(block)
def forward(self,
m: torch.Tensor,
z: torch.Tensor,
chunk_size: int,
msa_mask: Optional[torch.Tensor] = None,
pair_mask: Optional[torch.Tensor] = None,
_mask_trans: bool = True,
) -> torch.Tensor:
"""
Args:
m:
[*, N_extra, N_res, C_m] extra MSA embedding
z:
[*, N_res, N_res, C_z] pair embedding
msa_mask:
Optional [*, N_extra, N_res] MSA mask
pair_mask:
Optional [*, N_res, N_res] pair mask
Returns:
[*, N_res, N_res, C_z] pair update
"""
#checkpoint_fn = get_checkpoint_fn()
#blocks = [
# partial(b, msa_mask=msa_mask, pair_mask=pair_mask, chunk_size=chunk_size, _chunk_logits=None) for b in self.blocks
#]
#def dodo(b, *args):
# torch.cuda.empty_cache()
# return b(*args)
#blocks = [partial(dodo, b) for b in blocks]
#for b in blocks:
# if(torch.is_grad_enabled()):
# m, z = checkpoint_fn(b, *(m, z))
# else:
# m, z = b(m, z)
for b in self.blocks:
m, z = b(m, z, msa_mask, pair_mask, chunk_size=chunk_size)
if(self.clear_cache_between_blocks):
torch.cuda.empty_cache()
return z
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
from fastfold.model.nn.primitives import Linear, LayerNorm
from fastfold.model.hub.loss import (
compute_plddt,
compute_tm,
compute_predicted_aligned_error,
)
class AuxiliaryHeads(nn.Module):
def __init__(self, config):
super(AuxiliaryHeads, self).__init__()
self.plddt = PerResidueLDDTCaPredictor(
**config["lddt"],
)
self.distogram = DistogramHead(
**config["distogram"],
)
self.masked_msa = MaskedMSAHead(
**config["masked_msa"],
)
self.experimentally_resolved = ExperimentallyResolvedHead(
**config["experimentally_resolved"],
)
if config.tm.enabled:
self.tm = TMScoreHead(
**config.tm,
)
self.config = config
def forward(self, outputs):
aux_out = {}
lddt_logits = self.plddt(outputs["sm"]["single"])
aux_out["lddt_logits"] = lddt_logits
# Required for relaxation later on
aux_out["plddt"] = compute_plddt(lddt_logits)
distogram_logits = self.distogram(outputs["pair"])
aux_out["distogram_logits"] = distogram_logits
masked_msa_logits = self.masked_msa(outputs["msa"])
aux_out["masked_msa_logits"] = masked_msa_logits
experimentally_resolved_logits = self.experimentally_resolved(
outputs["single"]
)
aux_out[
"experimentally_resolved_logits"
] = experimentally_resolved_logits
if self.config.tm.enabled:
tm_logits = self.tm(outputs["pair"])
aux_out["tm_logits"] = tm_logits
aux_out["predicted_tm_score"] = compute_tm(
tm_logits, **self.config.tm
)
aux_out.update(
compute_predicted_aligned_error(
tm_logits,
**self.config.tm,
)
)
return aux_out
class PerResidueLDDTCaPredictor(nn.Module):
def __init__(self, no_bins, c_in, c_hidden):
super(PerResidueLDDTCaPredictor, self).__init__()
self.no_bins = no_bins
self.c_in = c_in
self.c_hidden = c_hidden
self.layer_norm = LayerNorm(self.c_in)
self.linear_1 = Linear(self.c_in, self.c_hidden, init="relu")
self.linear_2 = Linear(self.c_hidden, self.c_hidden, init="relu")
self.linear_3 = Linear(self.c_hidden, self.no_bins, init="final")
self.relu = nn.ReLU()
def forward(self, s):
s = self.layer_norm(s)
s = self.linear_1(s)
s = self.relu(s)
s = self.linear_2(s)
s = self.relu(s)
s = self.linear_3(s)
return s
class DistogramHead(nn.Module):
"""
Computes a distogram probability distribution.
For use in computation of distogram loss, subsection 1.9.8
"""
def __init__(self, c_z, no_bins, **kwargs):
"""
Args:
c_z:
Input channel dimension
no_bins:
Number of distogram bins
"""
super(DistogramHead, self).__init__()
self.c_z = c_z
self.no_bins = no_bins
self.linear = Linear(self.c_z, self.no_bins, init="final")
def forward(self, z): # [*, N, N, C_z]
"""
Args:
z:
[*, N_res, N_res, C_z] pair embedding
Returns:
[*, N, N, no_bins] distogram probability distribution
"""
# [*, N, N, no_bins]
logits = self.linear(z)
logits = logits + logits.transpose(-2, -3)
return logits
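# Hedged sketch (toy sizes): because the logits are added to their own
# (-2, -3) transpose, the distogram prediction is symmetric in the two
# residue dimensions.
if __name__ == "__main__":
    head = DistogramHead(c_z=8, no_bins=64)
    z = torch.randn(1, 10, 10, 8)
    logits = head(z)
    assert torch.allclose(logits, logits.transpose(-2, -3))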
class TMScoreHead(nn.Module):
"""
For use in computation of TM-score, subsection 1.9.7
"""
def __init__(self, c_z, no_bins, **kwargs):
"""
Args:
c_z:
Input channel dimension
no_bins:
Number of bins
"""
super(TMScoreHead, self).__init__()
self.c_z = c_z
self.no_bins = no_bins
self.linear = Linear(self.c_z, self.no_bins, init="final")
def forward(self, z):
"""
Args:
z:
[*, N_res, N_res, C_z] pairwise embedding
Returns:
[*, N_res, N_res, no_bins] prediction
"""
# [*, N, N, no_bins]
logits = self.linear(z)
return logits
class MaskedMSAHead(nn.Module):
"""
For use in computation of masked MSA loss, subsection 1.9.9
"""
def __init__(self, c_m, c_out, **kwargs):
"""
Args:
c_m:
MSA channel dimension
c_out:
Output channel dimension
"""
super(MaskedMSAHead, self).__init__()
self.c_m = c_m
self.c_out = c_out
self.linear = Linear(self.c_m, self.c_out, init="final")
def forward(self, m):
"""
Args:
m:
[*, N_seq, N_res, C_m] MSA embedding
Returns:
[*, N_seq, N_res, C_out] reconstruction
"""
# [*, N_seq, N_res, C_out]
logits = self.linear(m)
return logits
class ExperimentallyResolvedHead(nn.Module):
"""
For use in computation of "experimentally resolved" loss, subsection
1.9.10
"""
def __init__(self, c_s, c_out, **kwargs):
"""
Args:
c_s:
Input channel dimension
c_out:
Output channel dimension
"""
super(ExperimentallyResolvedHead, self).__init__()
self.c_s = c_s
self.c_out = c_out
self.linear = Linear(self.c_s, self.c_out, init="final")
def forward(self, s):
"""
Args:
s:
[*, N_res, C_s] single embedding
Returns:
[*, N, C_out] logits
"""
# [*, N, C_out]
logits = self.linear(s)
return logits
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import torch
import torch.nn as nn
from typing import Optional, List, Tuple
from fastfold.model.nn.primitives import (
Linear,
LayerNorm,
Attention,
GlobalAttention,
_attention_chunked_trainable,
)
from fastfold.utils.checkpointing import get_checkpoint_fn
from fastfold.utils.tensor_utils import (
chunk_layer,
permute_final_dims,
flatten_final_dims,
)
class MSAAttention(nn.Module):
def __init__(
self,
c_in,
c_hidden,
no_heads,
pair_bias=False,
c_z=None,
inf=1e9,
):
"""
Args:
c_in:
Input channel dimension
c_hidden:
Per-head hidden channel dimension
no_heads:
Number of attention heads
pair_bias:
Whether to use pair embedding bias
c_z:
Pair embedding channel dimension. Ignored unless pair_bias
is true
inf:
A large number to be used in computing the attention mask
"""
super(MSAAttention, self).__init__()
self.c_in = c_in
self.c_hidden = c_hidden
self.no_heads = no_heads
self.pair_bias = pair_bias
self.c_z = c_z
self.inf = inf
self.layer_norm_m = LayerNorm(self.c_in)
self.layer_norm_z = None
self.linear_z = None
if self.pair_bias:
self.layer_norm_z = LayerNorm(self.c_z)
self.linear_z = Linear(
self.c_z, self.no_heads, bias=False, init="normal"
)
self.mha = Attention(
self.c_in, self.c_in, self.c_in, self.c_hidden, self.no_heads
)
@torch.jit.ignore
def _chunk(self,
m: torch.Tensor,
biases: List[torch.Tensor],
chunk_size: int,
) -> torch.Tensor:
return chunk_layer(
self.mha,
{"q_x": m, "kv_x": m, "biases": biases},
chunk_size=chunk_size,
no_batch_dims=len(m.shape[:-2]),
)
def _prep_inputs(self,
m: torch.Tensor,
z: Optional[torch.Tensor],
mask: Optional[torch.Tensor]
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
# [*, N_seq, N_res, C_m]
m = self.layer_norm_m(m)
n_seq, n_res = m.shape[-3:-1]
if mask is None:
# [*, N_seq, N_res]
mask = m.new_ones(
m.shape[:-3] + (n_seq, n_res),
)
# [*, N_seq, 1, 1, N_res]
mask_bias = (self.inf * (mask - 1))[..., :, None, None, :]
# This step simply returns a larger view of the bias, and does not
# consume additional memory.
# [*, N_seq, no_heads, N_res, N_res]
#bias = bias.expand(
# ((-1,) * len(bias.shape[:-4])) + (-1, self.no_heads, n_res, -1)
#)
if (self.pair_bias and
z is not None and # For the
self.layer_norm_z is not None and # benefit of
self.linear_z is not None # TorchScript
):
# [*, N_res, N_res, C_z]
z = self.layer_norm_z(z)
# [*, N_res, N_res, no_heads]
z = self.linear_z(z)
# [*, 1, no_heads, N_res, N_res]
z = permute_final_dims(z, (2, 0, 1)).unsqueeze(-4)
return m, mask_bias, z
@torch.jit.ignore
def _chunked_msa_attn(self,
m: torch.Tensor,
z: Optional[torch.Tensor],
mask: Optional[torch.Tensor],
chunk_logits: int,
checkpoint: bool,
) -> torch.Tensor:
MSA_DIM = -4
def _get_qkv(m, z):
m, mask_bias, z = self._prep_inputs(m, z, mask)
q, k, v = self.mha._prep_qkv(m, m)
return m, q, k, v, mask_bias, z
checkpoint_fn = get_checkpoint_fn()
if(torch.is_grad_enabled() and checkpoint):
m, q, k, v, mask_bias, z = checkpoint_fn(_get_qkv, m, z)
else:
m, q, k, v, mask_bias, z = _get_qkv(m, z)
o = _attention_chunked_trainable(
query=q,
key=k,
value=v,
biases=[mask_bias, z],
chunk_size=chunk_logits,
chunk_dim=MSA_DIM,
checkpoint=checkpoint,
)
if(torch.is_grad_enabled() and checkpoint):
# Storing an additional m here is far from ideal
m = checkpoint_fn(self.mha._wrap_up, o, m)
else:
m = self.mha._wrap_up(o, m)
return m
def forward(self,
m: torch.Tensor,
z: Optional[torch.Tensor] = None,
mask: Optional[torch.Tensor] = None,
chunk_size: Optional[int] = None,
_chunk_logits: Optional[int] = None,
_checkpoint_chunks: Optional[bool] = None,
) -> torch.Tensor:
"""
Args:
m:
[*, N_seq, N_res, C_m] MSA embedding
z:
[*, N_res, N_res, C_z] pair embedding. Required only if
pair_bias is True
mask:
[*, N_seq, N_res] MSA mask
chunk_size:
Size of chunks into which the inputs are split along their
batch dimensions. A low value decreases memory overhead at the
cost of slower execution. Chunking is not performed by default.
"""
if(_chunk_logits is not None):
return self._chunked_msa_attn(
m=m, z=z, mask=mask,
chunk_logits=_chunk_logits, checkpoint=_checkpoint_chunks
)
m, mask_bias, z = self._prep_inputs(m, z, mask)
biases = [mask_bias]
if(z is not None):
biases.append(z)
if chunk_size is not None:
m = self._chunk(m, biases, chunk_size)
else:
m = self.mha(
q_x=m,
kv_x=m,
biases=biases
)
return m
class MSARowAttentionWithPairBias(MSAAttention):
"""
Implements Algorithm 7.
"""
def __init__(self, c_m, c_z, c_hidden, no_heads, inf=1e9):
"""
Args:
c_m:
Input channel dimension
c_z:
Pair embedding channel dimension
c_hidden:
Per-head hidden channel dimension
no_heads:
Number of attention heads
inf:
Large number used to construct attention masks
"""
super(MSARowAttentionWithPairBias, self).__init__(
c_m,
c_hidden,
no_heads,
pair_bias=True,
c_z=c_z,
inf=inf,
)
class MSAColumnAttention(nn.Module):
"""
Implements Algorithm 8.
By rights, this should also be a subclass of MSAAttention. Alas,
most inheritance isn't supported by TorchScript.
"""
def __init__(self, c_m, c_hidden, no_heads, inf=1e9):
"""
Args:
c_m:
MSA channel dimension
c_hidden:
Per-head hidden channel dimension
no_heads:
Number of attention heads
inf:
Large number used to construct attention masks
"""
super(MSAColumnAttention, self).__init__()
self.c_m = c_m
self.c_hidden = c_hidden
self.no_heads = no_heads
self.inf = inf
self._msa_att = MSAAttention(
c_in=c_m,
c_hidden=c_hidden,
no_heads=no_heads,
pair_bias=False,
c_z=None,
inf=inf,
)
def forward(self,
m: torch.Tensor,
mask: Optional[torch.Tensor] = None,
chunk_size: Optional[int] = None
) -> torch.Tensor:
"""
Args:
m:
[*, N_seq, N_res, C_m] MSA embedding
mask:
[*, N_seq, N_res] MSA mask
chunk_size:
Size of chunks into which the inputs are split along their
batch dimensions. A low value decreases memory overhead at the
cost of slower execution. Chunking is not performed by default.
"""
# [*, N_res, N_seq, C_in]
m = m.transpose(-2, -3)
if mask is not None:
mask = mask.transpose(-1, -2)
m = self._msa_att(m, mask=mask, chunk_size=chunk_size)
# [*, N_seq, N_res, C_in]
m = m.transpose(-2, -3)
if mask is not None:
mask = mask.transpose(-1, -2)
return m
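# Hedged usage sketch (toy sizes): column attention transposes the sequence
# and residue dimensions, reuses the shared MSAAttention module, and
# transposes back, so the output shape matches the input.
if __name__ == "__main__":
    col_att = MSAColumnAttention(c_m=16, c_hidden=8, no_heads=2)
    m = torch.randn(1, 5, 10, 16)    # [*, N_seq, N_res, C_m]
    msa_mask = torch.ones(1, 5, 10)  # [*, N_seq, N_res]
    print(col_att(m, mask=msa_mask).shape)  # torch.Size([1, 5, 10, 16])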
class MSAColumnGlobalAttention(nn.Module):
def __init__(
self, c_in, c_hidden, no_heads, inf=1e9, eps=1e-10,
):
super(MSAColumnGlobalAttention, self).__init__()
self.c_in = c_in
self.c_hidden = c_hidden
self.no_heads = no_heads
self.inf = inf
self.eps = eps
self.layer_norm_m = nn.LayerNorm(c_in)
self.global_attention = GlobalAttention(
c_in=c_in,
c_hidden=c_hidden,
no_heads=no_heads,
inf=inf,
eps=eps,
)
@torch.jit.ignore
def _chunk(self,
m: torch.Tensor,
mask: torch.Tensor,
chunk_size: int,
) -> torch.Tensor:
mha_input = {
"m": m,
"mask": mask,
}
return chunk_layer(
self.global_attention,
mha_input,
chunk_size=chunk_size,
no_batch_dims=len(m.shape[:-2]),
)
def forward(
self,
m: torch.Tensor,
mask: Optional[torch.Tensor] = None,
chunk_size: Optional[int] = None,
) -> torch.Tensor:
n_seq, n_res, c_in = m.shape[-3:]
if mask is None:
# [*, N_seq, N_res]
mask = torch.ones(
m.shape[:-1],
dtype=m.dtype,
device=m.device,
).detach()
# [*, N_res, N_seq, C_in]
m = m.transpose(-2, -3)
mask = mask.transpose(-1, -2)
# [*, N_res, N_seq, C_in]
m = self.layer_norm_m(m)
if chunk_size is not None:
m = self._chunk(m, mask, chunk_size)
else:
m = self.global_attention(m=m, mask=mask)
# [*, N_seq, N_res, C_in]
m = m.transpose(-2, -3)
return m
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
from typing import Optional
import torch
import torch.nn as nn
from fastfold.model.nn.primitives import Linear
from fastfold.utils.tensor_utils import chunk_layer
class OuterProductMean(nn.Module):
"""
Implements Algorithm 10.
"""
def __init__(self, c_m, c_z, c_hidden, eps=1e-3):
"""
Args:
c_m:
MSA embedding channel dimension
c_z:
Pair embedding channel dimension
c_hidden:
Hidden channel dimension
"""
super(OuterProductMean, self).__init__()
self.c_m = c_m
self.c_z = c_z
self.c_hidden = c_hidden
self.eps = eps
self.layer_norm = nn.LayerNorm(c_m)
self.linear_1 = Linear(c_m, c_hidden)
self.linear_2 = Linear(c_m, c_hidden)
self.linear_out = Linear(c_hidden ** 2, c_z, init="final")
def _opm(self, a, b):
# [*, N_res, N_res, C, C]
outer = torch.einsum("...bac,...dae->...bdce", a, b)
# [*, N_res, N_res, C * C]
outer = outer.reshape(outer.shape[:-2] + (-1,))
# [*, N_res, N_res, C_z]
outer = self.linear_out(outer)
return outer
@torch.jit.ignore
def _chunk(self,
a: torch.Tensor,
b: torch.Tensor,
chunk_size: int
) -> torch.Tensor:
# Since the "batch dim" in this case is not a true batch dimension
# (in that the shape of the output depends on it), we need to
# iterate over it ourselves
a_reshape = a.reshape((-1,) + a.shape[-3:])
b_reshape = b.reshape((-1,) + b.shape[-3:])
out = []
for a_prime, b_prime in zip(a_reshape, b_reshape):
outer = chunk_layer(
partial(self._opm, b=b_prime),
{"a": a_prime},
chunk_size=chunk_size,
no_batch_dims=1,
)
out.append(outer)
outer = torch.stack(out, dim=0)
outer = outer.reshape(a.shape[:-3] + outer.shape[1:])
return outer
def forward(self,
m: torch.Tensor,
mask: Optional[torch.Tensor] = None,
chunk_size: Optional[int] = None
) -> torch.Tensor:
"""
Args:
m:
[*, N_seq, N_res, C_m] MSA embedding
mask:
[*, N_seq, N_res] MSA mask
Returns:
[*, N_res, N_res, C_z] pair embedding update
"""
if mask is None:
mask = m.new_ones(m.shape[:-1])
# [*, N_seq, N_res, C_m]
m = self.layer_norm(m)
# [*, N_seq, N_res, C]
mask = mask.unsqueeze(-1)
a = self.linear_1(m) * mask
b = self.linear_2(m) * mask
a = a.transpose(-2, -3)
b = b.transpose(-2, -3)
if chunk_size is not None:
outer = self._chunk(a, b, chunk_size)
else:
outer = self._opm(a, b)
# [*, N_res, N_res, 1]
norm = torch.einsum("...abc,...adc->...bdc", mask, mask)
# [*, N_res, N_res, C_z]
outer = outer / (self.eps + norm)
return outer
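# Hedged usage sketch (toy sizes): for every residue pair the outer product of
# the two projected MSA columns is averaged over sequences, normalized by the
# mask overlap computed in the einsum above.
if __name__ == "__main__":
    opm = OuterProductMean(c_m=16, c_z=8, c_hidden=4)
    m = torch.randn(2, 5, 10, 16)    # [*, N_seq, N_res, C_m]
    msa_mask = torch.ones(2, 5, 10)  # [*, N_seq, N_res]
    print(opm(m, mask=msa_mask).shape)  # torch.Size([2, 10, 10, 8])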
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
import torch
import torch.nn as nn
from fastfold.model.nn.primitives import Linear, LayerNorm
from fastfold.utils.tensor_utils import chunk_layer
class PairTransition(nn.Module):
"""
Implements Algorithm 15.
"""
def __init__(self, c_z, n):
"""
Args:
c_z:
Pair transition channel dimension
n:
Factor by which c_z is multiplied to obtain hidden channel
dimension
"""
super(PairTransition, self).__init__()
self.c_z = c_z
self.n = n
self.layer_norm = LayerNorm(self.c_z)
self.linear_1 = Linear(self.c_z, self.n * self.c_z, init="relu")
self.relu = nn.ReLU()
self.linear_2 = Linear(self.n * self.c_z, c_z, init="final")
def _transition(self, z, mask):
# [*, N_res, N_res, C_hidden]
z = self.linear_1(z)
z = self.relu(z)
# [*, N_res, N_res, C_z]
z = self.linear_2(z) * mask
return z
@torch.jit.ignore
def _chunk(self,
z: torch.Tensor,
mask: torch.Tensor,
chunk_size: int,
) -> torch.Tensor:
return chunk_layer(
self._transition,
{"z": z, "mask": mask},
chunk_size=chunk_size,
no_batch_dims=len(z.shape[:-2]),
)
def forward(self,
z: torch.Tensor,
mask: Optional[torch.Tensor] = None,
chunk_size: Optional[int] = None,
) -> torch.Tensor:
"""
Args:
z:
[*, N_res, N_res, C_z] pair embedding
Returns:
[*, N_res, N_res, C_z] pair embedding update
"""
# DISCREPANCY: DeepMind forgets to apply the mask in this module.
if mask is None:
mask = z.new_ones(z.shape[:-1])
# [*, N_res, N_res, 1]
mask = mask.unsqueeze(-1)
# [*, N_res, N_res, C_z]
z = self.layer_norm(z)
if chunk_size is not None:
z = self._chunk(z, mask, chunk_size)
else:
z = self._transition(z=z, mask=mask)
return z
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
import math
from typing import Optional, Callable, List, Tuple, Sequence
import numpy as np
import torch
import torch.nn as nn
from scipy.stats import truncnorm
from fastfold.utils.checkpointing import get_checkpoint_fn
from fastfold.utils.tensor_utils import (
permute_final_dims,
flatten_final_dims,
_chunk_slice,
)
import fastfold.habana as habana
def _prod(nums):
out = 1
for n in nums:
out = out * n
return out
def _calculate_fan(linear_weight_shape, fan="fan_in"):
fan_out, fan_in = linear_weight_shape
if fan == "fan_in":
f = fan_in
elif fan == "fan_out":
f = fan_out
elif fan == "fan_avg":
f = (fan_in + fan_out) / 2
else:
raise ValueError("Invalid fan option")
return f
def trunc_normal_init_(weights, scale=1.0, fan="fan_in"):
shape = weights.shape
f = _calculate_fan(shape, fan)
scale = scale / max(1, f)
a = -2
b = 2
std = math.sqrt(scale) / truncnorm.std(a=a, b=b, loc=0, scale=1)
size = _prod(shape)
samples = truncnorm.rvs(a=a, b=b, loc=0, scale=std, size=size)
samples = np.reshape(samples, shape)
with torch.no_grad():
weights.copy_(torch.tensor(samples, device=weights.device))
def lecun_normal_init_(weights):
trunc_normal_init_(weights, scale=1.0)
def he_normal_init_(weights):
trunc_normal_init_(weights, scale=2.0)
def glorot_uniform_init_(weights):
nn.init.xavier_uniform_(weights, gain=1)
def final_init_(weights):
with torch.no_grad():
weights.fill_(0.0)
def gating_init_(weights):
with torch.no_grad():
weights.fill_(0.0)
def normal_init_(weights):
torch.nn.init.kaiming_normal_(weights, nonlinearity="linear")
def ipa_point_weights_init_(weights):
with torch.no_grad():
softplus_inverse_1 = 0.541324854612918
weights.fill_(softplus_inverse_1)
class Linear(nn.Linear):
"""
A Linear layer with built-in nonstandard initializations. Called just
like torch.nn.Linear.
Implements the initializers in 1.11.4, plus some additional ones found
in the code.
"""
def __init__(
self,
in_dim: int,
out_dim: int,
bias: bool = True,
init: str = "default",
init_fn: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None,
):
"""
Args:
in_dim:
The final dimension of inputs to the layer
out_dim:
The final dimension of layer outputs
bias:
Whether to learn an additive bias. True by default
init:
The initializer to use. Choose from:
"default": LeCun fan-in truncated normal initialization
"relu": He initialization w/ truncated normal distribution
"glorot": Fan-average Glorot uniform initialization
"gating": Weights=0, Bias=1
"normal": Normal initialization with std=1/sqrt(fan_in)
"final": Weights=0, Bias=0
Overridden by init_fn if the latter is not None.
init_fn:
A custom initializer taking weight and bias as inputs.
Overrides init if not None.
"""
super(Linear, self).__init__(in_dim, out_dim, bias=bias)
if bias:
with torch.no_grad():
self.bias.fill_(0)
if init_fn is not None:
init_fn(self.weight, self.bias)
else:
if init == "default":
lecun_normal_init_(self.weight)
elif init == "relu":
he_normal_init_(self.weight)
elif init == "glorot":
glorot_uniform_init_(self.weight)
elif init == "gating":
gating_init_(self.weight)
if bias:
with torch.no_grad():
self.bias.fill_(1.0)
elif init == "normal":
normal_init_(self.weight)
elif init == "final":
final_init_(self.weight)
else:
raise ValueError("Invalid init string.")
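# Hedged sketch of the nonstandard initializers: "final" zeroes both weights
# and bias so the layer initially contributes nothing to a residual branch,
# while "gating" zeroes the weights and sets the bias to 1 so the sigmoid gate
# starts out (nearly) open.
if __name__ == "__main__":
    final = Linear(8, 8, init="final")
    gating = Linear(8, 8, init="gating")
    assert bool((final.weight == 0).all()) and bool((final.bias == 0).all())
    assert bool((gating.weight == 0).all()) and bool((gating.bias == 1).all())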
class LayerNorm(nn.Module):
def __init__(self, c_in, eps=1e-5):
super(LayerNorm, self).__init__()
self.c_in = (c_in,)
self.eps = eps
self.weight = nn.Parameter(torch.ones(c_in))
self.bias = nn.Parameter(torch.zeros(c_in))
def forward(self, x):
out = nn.functional.layer_norm(
x,
self.c_in,
self.weight,
self.bias,
self.eps,
)
return out
@torch.jit.ignore
def softmax(t: torch.Tensor, dim: int = -1) -> torch.Tensor:
"""
Softmax, but without automatic casting to fp32 when the input is of
type bfloat16
"""
s = torch.nn.functional.softmax(t, dim=dim)
return s
#@torch.jit.script
def _attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
biases: List[torch.Tensor]) -> torch.Tensor:
# [*, H, Q, C_hidden]
query = permute_final_dims(query, (1, 0, 2))
# [*, H, C_hidden, K]
key = permute_final_dims(key, (1, 2, 0))
# [*, H, V, C_hidden]
value = permute_final_dims(value, (1, 0, 2))
# [*, H, Q, K]
a = torch.matmul(query, key)
if habana.is_habana():
from fastfold.habana.fastnn.custom_op import fused_softmax, fused_softmax_bias
if len(biases) == 1:
a = fused_softmax(a, biases[0], -1)
else:
a = fused_softmax_bias(a, biases[0], biases[1], -1)
else:
for b in biases:
a += b
a = softmax(a, -1)
# [*, H, Q, C_hidden]
a = a.to(dtype=value.dtype)
a = torch.matmul(a, value)
# [*, Q, H, C_hidden]
a = a.transpose(-2, -3)
return a
@torch.jit.ignore
def _attention_chunked_trainable(
query,
key,
value,
biases,
chunk_size,
chunk_dim,
checkpoint,
):
if (checkpoint and len(biases) > 2):
raise ValueError("Checkpointed version permits only two bias terms")
def _checkpointable_attention(q, k, v, b1, b2):
bs = [b for b in [b1, b2] if b is not None]
return _attention(q, k, v, bs)
o_chunks = []
checkpoint_fn = get_checkpoint_fn()
count = query.shape[chunk_dim]
for start in range(0, count, chunk_size):
end = start + chunk_size
idx = [slice(None)] * len(query.shape)
idx[chunk_dim] = slice(start, end)
idx_tup = tuple(idx)
q_chunk = query[idx_tup]
k_chunk = key[idx_tup]
v_chunk = value[idx_tup]
def _slice_bias(b):
idx[chunk_dim] = (slice(start, end) if b.shape[chunk_dim] != 1 else slice(None))
return b[tuple(idx)]
if (checkpoint):
bias_1_chunk, bias_2_chunk = [
_slice_bias(b) if b is not None else None for b in (biases + [None, None])[:2]
]
o_chunk = checkpoint_fn(_checkpointable_attention, q_chunk, k_chunk, v_chunk,
bias_1_chunk, bias_2_chunk)
else:
bias_chunks = [_slice_bias(b) for b in biases]
o_chunk = _attention(q_chunk, k_chunk, v_chunk, bias_chunks)
o_chunks.append(o_chunk)
o = torch.cat(o_chunks, dim=chunk_dim)
return o
class Attention(nn.Module):
"""
Standard multi-head attention using AlphaFold's default layer
initialization. Allows multiple bias vectors.
"""
def __init__(
self,
c_q: int,
c_k: int,
c_v: int,
c_hidden: int,
no_heads: int,
gating: bool = True,
):
"""
Args:
c_q:
Input dimension of query data
c_k:
Input dimension of key data
c_v:
Input dimension of value data
c_hidden:
Per-head hidden dimension
no_heads:
Number of attention heads
gating:
Whether the output should be gated using query data
"""
super(Attention, self).__init__()
self.c_q = c_q
self.c_k = c_k
self.c_v = c_v
self.c_hidden = c_hidden
self.no_heads = no_heads
self.gating = gating
# DISCREPANCY: c_hidden is not the per-head channel dimension, as
# stated in the supplement, but the overall channel dimension.
self.linear_q = Linear(self.c_q, self.c_hidden * self.no_heads, bias=False, init="glorot")
self.linear_k = Linear(self.c_k, self.c_hidden * self.no_heads, bias=False, init="glorot")
self.linear_v = Linear(self.c_v, self.c_hidden * self.no_heads, bias=False, init="glorot")
self.linear_o = Linear(self.c_hidden * self.no_heads, self.c_q, init="final")
self.linear_g = None
if self.gating:
self.linear_g = Linear(self.c_q, self.c_hidden * self.no_heads, init="gating")
self.sigmoid = nn.Sigmoid()
def _prep_qkv(self, q_x: torch.Tensor,
kv_x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
# [*, Q/K/V, H * C_hidden]
q = self.linear_q(q_x)
k = self.linear_k(kv_x)
v = self.linear_v(kv_x)
# [*, Q/K, H, C_hidden]
q = q.view(q.shape[:-1] + (self.no_heads, -1))
k = k.view(k.shape[:-1] + (self.no_heads, -1))
v = v.view(v.shape[:-1] + (self.no_heads, -1))
q /= math.sqrt(self.c_hidden)
return q, k, v
def _wrap_up(self, o: torch.Tensor, q_x: torch.Tensor) -> torch.Tensor:
if (self.linear_g is not None):
g = self.sigmoid(self.linear_g(q_x))
# [*, Q, H, C_hidden]
g = g.view(g.shape[:-1] + (self.no_heads, -1))
o = o * g
# [*, Q, H * C_hidden]
o = flatten_final_dims(o, 2)
# [*, Q, C_q]
o = self.linear_o(o)
return o
def forward(
self,
q_x: torch.Tensor,
kv_x: torch.Tensor,
biases: Optional[List[torch.Tensor]] = None,
use_lma: bool = False,
q_chunk_size: Optional[int] = None,
kv_chunk_size: Optional[int] = None,
) -> torch.Tensor:
"""
Args:
q_x:
[*, Q, C_q] query data
kv_x:
[*, K, C_k] key data
biases:
List of biases that broadcast to [*, H, Q, K]
use_lma:
Whether to use low-memory attention
q_chunk_size:
Query chunk size (for LMA)
kv_chunk_size:
Key/Value chunk size (for LMA)
Returns:
[*, Q, C_q] attention update
"""
if (biases is None):
biases = []
if (use_lma and (q_chunk_size is None or kv_chunk_size is None)):
raise ValueError("If use_lma is specified, q_chunk_size and kv_chunk_size must "
"be provided")
q, k, v = self._prep_qkv(q_x, kv_x)
if (use_lma):
biases = [b.expand(b.shape[:-2] + (q_x.shape[-2],) + (kv_x.shape[-2],)) for b in biases]
o = _lma(q, k, v, biases, q_chunk_size, kv_chunk_size)
else:
o = _attention(q, k, v, biases)
o = self._wrap_up(o, q_x)
return o
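# Hedged usage sketch (toy sizes) of the gated multi-head attention above; the
# single additive bias must broadcast to [*, H, Q, K].
if __name__ == "__main__":
    attn = Attention(c_q=16, c_k=16, c_v=16, c_hidden=8, no_heads=2)
    x = torch.randn(1, 10, 16)        # [*, Q, C_q], self-attention (q_x == kv_x)
    bias = torch.zeros(1, 1, 10, 10)  # broadcasts over the head dimension
    print(attn(q_x=x, kv_x=x, biases=[bias]).shape)  # torch.Size([1, 10, 16])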
class GlobalAttention(nn.Module):
def __init__(self, c_in, c_hidden, no_heads, inf, eps):
super(GlobalAttention, self).__init__()
self.c_in = c_in
self.c_hidden = c_hidden
self.no_heads = no_heads
self.inf = inf
self.eps = eps
self.linear_q = Linear(c_in, c_hidden * no_heads, bias=False, init="glorot")
self.linear_k = Linear(
c_in,
c_hidden,
bias=False,
init="glorot",
)
self.linear_v = Linear(
c_in,
c_hidden,
bias=False,
init="glorot",
)
self.linear_g = Linear(c_in, c_hidden * no_heads, init="gating")
self.linear_o = Linear(c_hidden * no_heads, c_in, init="final")
self.sigmoid = nn.Sigmoid()
def forward(self, m: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
# [*, N_res, C_in]
q = torch.sum(m * mask.unsqueeze(-1),
dim=-2) / (torch.sum(mask, dim=-1)[..., None] + self.eps)
# [*, N_res, H * C_hidden]
q = self.linear_q(q)
q *= (self.c_hidden**(-0.5))
# [*, N_res, H, C_hidden]
q = q.view(q.shape[:-1] + (self.no_heads, -1))
# [*, N_res, N_seq, C_hidden]
k = self.linear_k(m)
v = self.linear_v(m)
# [*, N_res, H, N_seq]
a = torch.matmul(
q,
k.transpose(-1, -2), # [*, N_res, C_hidden, N_seq]
)
bias = (self.inf * (mask - 1))[..., :, None, :]
if habana.is_habana():
from fastfold.habana.fastnn.custom_op import fused_softmax, fused_softmax_bias
a = fused_softmax(a, bias, -1)
else:
a += bias
a = softmax(a)
# [*, N_res, H, C_hidden]
a = a.to(dtype=v.dtype)
o = torch.matmul(
a,
v,
)
# [*, N_res, N_seq, H * C_hidden]
g = self.sigmoid(self.linear_g(m))
# [*, N_res, N_seq, H, C_hidden]
g = g.view(g.shape[:-1] + (self.no_heads, -1))
# [*, N_res, N_seq, H, C_hidden]
o = o.unsqueeze(-3) * g
# [*, N_res, N_seq, H * C_hidden]
o = o.reshape(o.shape[:-2] + (-1,))
# [*, N_res, N_seq, C_in]
m = self.linear_o(o)
return m
def _lma(
q: torch.Tensor,
k: torch.Tensor,
v: torch.Tensor,
biases: List[torch.Tensor],
q_chunk_size: int,
kv_chunk_size: int,
):
no_q, no_kv = q.shape[-3], k.shape[-3]
# [*, Q, H, C_hidden]
o = q.new_zeros(q.shape)
for q_s in range(0, no_q, q_chunk_size):
q_chunk = q[..., q_s:q_s + q_chunk_size, :, :]
large_bias_chunks = [b[..., q_s:q_s + q_chunk_size, :] for b in biases]
maxes = []
weights = []
values = []
for kv_s in range(0, no_kv, kv_chunk_size):
k_chunk = k[..., kv_s:kv_s + kv_chunk_size, :, :]
v_chunk = v[..., kv_s:kv_s + kv_chunk_size, :, :]
small_bias_chunks = [b[..., kv_s:kv_s + kv_chunk_size] for b in large_bias_chunks]
a = torch.einsum(
"...qhd,...khd->...hqk",
q_chunk,
k_chunk,
)
for b in small_bias_chunks:
a += b
a = a.transpose(-2, -3)
max_a = torch.max(a, dim=-1, keepdim=True)[0]
exp_a = torch.exp(a - max_a)
exp_v = torch.einsum("...vhf,...qhv->...qhf", v_chunk, exp_a)
maxes.append(max_a.detach().squeeze(-1))
weights.append(torch.sum(exp_a, dim=-1))
values.append(exp_v)
chunk_max = torch.stack(maxes, dim=-3)
chunk_weights = torch.stack(weights, dim=-3)
chunk_values = torch.stack(values, dim=-4)
global_max = torch.max(chunk_max, dim=-3, keepdim=True)[0]
max_diffs = torch.exp(chunk_max - global_max)
chunk_values *= max_diffs.unsqueeze(-1)
chunk_weights *= max_diffs
all_values = torch.sum(chunk_values, dim=-4)
all_weights = torch.sum(chunk_weights.unsqueeze(-1), dim=-4)
q_chunk_out = all_values / all_weights
o[..., q_s:q_s + q_chunk_size, :, :] = q_chunk_out
return o
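# Hedged sanity sketch (toy sizes): the chunked low-memory attention should
# agree with the plain _attention computation up to floating-point error.
if __name__ == "__main__":
    q = torch.randn(1, 12, 2, 4)      # [*, Q, H, C_hidden]
    k = torch.randn(1, 12, 2, 4)      # [*, K, H, C_hidden]
    v = torch.randn(1, 12, 2, 4)      # [*, V, H, C_hidden]
    bias = torch.zeros(1, 2, 12, 12)  # [*, H, Q, K]
    reference = _attention(q, k, v, [bias])
    low_memory = _lma(q, k, v, [bias], q_chunk_size=4, kv_chunk_size=4)
    assert torch.allclose(reference, low_memory, atol=1e-4)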
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import torch
import torch.nn as nn
from typing import Any, Dict, Optional, Tuple, Union
from fastfold.model.nn.primitives import Linear, LayerNorm, ipa_point_weights_init_
from fastfold.common.residue_constants import (
restype_rigid_group_default_frame,
restype_atom14_to_rigid_group,
restype_atom14_mask,
restype_atom14_rigid_group_positions,
)
from fastfold.utils.geometry.quat_rigid import QuatRigid
from fastfold.utils.geometry.rigid_matrix_vector import Rigid3Array
from fastfold.utils.geometry.vector import Vec3Array
from fastfold.utils.feats import (
frames_and_literature_positions_to_atom14_pos,
torsion_angles_to_frames,
)
from fastfold.utils.rigid_utils import Rotation, Rigid
from fastfold.utils.tensor_utils import (
dict_multimap,
permute_final_dims,
flatten_final_dims,
)
import fastfold.habana as habana
class AngleResnetBlock(nn.Module):
def __init__(self, c_hidden):
"""
Args:
c_hidden:
Hidden channel dimension
"""
super(AngleResnetBlock, self).__init__()
self.c_hidden = c_hidden
self.linear_1 = Linear(self.c_hidden, self.c_hidden, init="relu")
self.linear_2 = Linear(self.c_hidden, self.c_hidden, init="final")
self.relu = nn.ReLU()
def forward(self, a: torch.Tensor) -> torch.Tensor:
s_initial = a
a = self.relu(a)
a = self.linear_1(a)
a = self.relu(a)
a = self.linear_2(a)
return a + s_initial
class AngleResnet(nn.Module):
"""
Implements Algorithm 20, lines 11-14
"""
def __init__(
self, c_in: int, c_hidden: int, no_blocks: int, no_angles: int, epsilon: float
):
"""
Args:
c_in:
Input channel dimension
c_hidden:
Hidden channel dimension
no_blocks:
Number of resnet blocks
no_angles:
Number of torsion angles to generate
epsilon:
Small constant for normalization
"""
super(AngleResnet, self).__init__()
self.c_in = c_in
self.c_hidden = c_hidden
self.no_blocks = no_blocks
self.no_angles = no_angles
self.eps = epsilon
self.linear_in = Linear(self.c_in, self.c_hidden)
self.linear_initial = Linear(self.c_in, self.c_hidden)
self.layers = nn.ModuleList()
for _ in range(self.no_blocks):
layer = AngleResnetBlock(c_hidden=self.c_hidden)
self.layers.append(layer)
self.linear_out = Linear(self.c_hidden, self.no_angles * 2)
self.relu = nn.ReLU()
def forward(
self, s: torch.Tensor, s_initial: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Args:
s:
[*, C_hidden] single embedding
s_initial:
[*, C_hidden] single embedding as of the start of the
StructureModule
Returns:
A tuple of the raw [*, no_angles, 2] angle predictions and the
same angles normalized to the unit circle
"""
# NOTE: The ReLU's applied to the inputs are absent from the supplement
# pseudocode but present in the source. For maximal compatibility with
# the pretrained weights, I'm going with the source.
# [*, C_hidden]
s_initial = self.relu(s_initial)
s_initial = self.linear_initial(s_initial)
s = self.relu(s)
s = self.linear_in(s)
s = s + s_initial
for l in self.layers:
s = l(s)
s = self.relu(s)
# [*, no_angles * 2]
s = self.linear_out(s)
# [*, no_angles, 2]
s = s.view(s.shape[:-1] + (-1, 2))
unnormalized_s = s
norm_denom = torch.sqrt(
torch.clamp(
torch.sum(s**2, dim=-1, keepdim=True),
min=self.eps,
)
)
s = s / norm_denom
return unnormalized_s, s
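# Hedged usage sketch (toy sizes): the resnet returns both the raw angle
# predictions and a copy normalized onto the unit circle, so the last
# dimension of the second output has norm ~1.
if __name__ == "__main__":
    resnet = AngleResnet(c_in=16, c_hidden=8, no_blocks=2, no_angles=7, epsilon=1e-8)
    s = torch.randn(3, 16)
    s_initial = torch.randn(3, 16)
    unnormalized_angles, angles = resnet(s, s_initial)
    print(unnormalized_angles.shape, angles.shape)  # [3, 7, 2] [3, 7, 2]
    print(angles.norm(dim=-1))                      # all close to 1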
class PointProjection(nn.Module):
def __init__(
self,
c_hidden: int,
num_points: int,
no_heads: int,
return_local_points: bool = False,
):
super().__init__()
self.return_local_points = return_local_points
self.no_heads = no_heads
self.linear = Linear(c_hidden, no_heads * 3 * num_points)
def forward(
self,
activations: torch.Tensor,
rigids: Rigid3Array,
) -> Union[Vec3Array, Tuple[Vec3Array, Vec3Array]]:
# TODO: Needs to run in high precision during training
points_local = self.linear(activations)
points_local = points_local.reshape(
*points_local.shape[:-1],
self.no_heads,
-1,
)
points_local = torch.split(points_local, points_local.shape[-1] // 3, dim=-1)
points_local = Vec3Array(*points_local)
points_global = rigids[..., None, None].apply_to_point(points_local)
if self.return_local_points:
return points_global, points_local
return points_global
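# Usage sketch added for illustration (sizes are illustrative): projects
# per-residue activations to per-head 3D points and maps them into the global
# frames given by an identity Rigid3Array.
def _example_point_projection():
    proj = PointProjection(c_hidden=16, num_points=4, no_heads=8)
    activations = torch.randn(2, 10, 16)
    rigids = Rigid3Array.identity((2, 10), activations.device)
    points = proj(activations, rigids)
    # points is a Vec3Array whose components have shape [2, 10, no_heads, num_points]
    assert points.x.shape == (2, 10, 8, 4)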
class InvariantPointAttention(nn.Module):
"""
Implements Algorithm 22.
"""
def __init__(
self,
c_s: int,
c_z: int,
c_hidden: int,
no_heads: int,
no_qk_points: int,
no_v_points: int,
inf: float = 1e5,
eps: float = 1e-8,
is_multimer: bool = False,
):
"""
Args:
c_s:
Single representation channel dimension
c_z:
Pair representation channel dimension
c_hidden:
Hidden channel dimension
no_heads:
Number of attention heads
no_qk_points:
Number of query/key points to generate
no_v_points:
Number of value points to generate
"""
super(InvariantPointAttention, self).__init__()
self.c_s = c_s
self.c_z = c_z
self.c_hidden = c_hidden
self.no_heads = no_heads
self.no_qk_points = no_qk_points
self.no_v_points = no_v_points
self.inf = inf
self.eps = eps
self.is_multimer = is_multimer
# These linear layers differ from their specifications in the
# supplement. There, they lack bias and use Glorot initialization.
# Here as in the official source, they have bias and use the default
# Lecun initialization.
if not self.is_multimer:
hc = self.c_hidden * self.no_heads
self.linear_q = Linear(self.c_s, hc, bias=(not is_multimer))
self.linear_kv = Linear(self.c_s, 2 * hc)
hpq = self.no_heads * self.no_qk_points * 3
self.linear_q_points = Linear(self.c_s, hpq)
hpkv = self.no_heads * (self.no_qk_points + self.no_v_points) * 3
self.linear_kv_points = Linear(self.c_s, hpkv)
# hpv = self.no_heads * self.no_v_points * 3
else:
hc = self.c_hidden * self.no_heads
self.linear_q = Linear(self.c_s, hc, bias=(not is_multimer))
self.linear_q_points = PointProjection(
self.c_s, self.no_qk_points, self.no_heads
)
self.linear_k = Linear(self.c_s, hc, bias=False)
self.linear_v = Linear(self.c_s, hc, bias=False)
self.linear_k_points = PointProjection(
self.c_s,
self.no_qk_points,
self.no_heads,
)
self.linear_v_points = PointProjection(
self.c_s,
self.no_v_points,
self.no_heads,
)
self.linear_b = Linear(self.c_z, self.no_heads)
self.head_weights = nn.Parameter(torch.zeros((no_heads)))
ipa_point_weights_init_(self.head_weights)
concat_out_dim = self.no_heads * (
self.c_z + self.c_hidden + self.no_v_points * 4
)
self.linear_out = Linear(concat_out_dim, self.c_s, init="final")
self.softmax = nn.Softmax(dim=-1)
self.softplus = nn.Softplus()
def forward(
self,
s: torch.Tensor,
z: torch.Tensor,
r: Union[Rigid, Rigid3Array],
mask: torch.Tensor,
) -> torch.Tensor:
"""
Args:
s:
[*, N_res, C_s] single representation
z:
[*, N_res, N_res, C_z] pair representation
r:
[*, N_res] transformation object
mask:
[*, N_res] mask
Returns:
[*, N_res, C_s] single representation update
"""
#######################################
# Generate scalar and point activations
#######################################
# The following two blocks are equivalent
# They're separated only to preserve compatibility with old AF weights
if self.is_multimer:
# [*, N_res, H * C_hidden]
q = self.linear_q(s)
# [*, N_res, H, C_hidden]
q = q.view(q.shape[:-1] + (self.no_heads, -1))
# [*, N_res, H, P_qk]
q_pts = self.linear_q_points(s, r)
# [*, N_res, H * C_hidden]
k = self.linear_k(s)
v = self.linear_v(s)
# [*, N_res, H, C_hidden]
k = k.view(k.shape[:-1] + (self.no_heads, -1))
v = v.view(v.shape[:-1] + (self.no_heads, -1))
# [*, N_res, H, P_qk, 3]
k_pts = self.linear_k_points(s, r)
# [*, N_res, H, P_v, 3]
v_pts = self.linear_v_points(s, r)
else:
# [*, N_res, H * C_hidden]
q = self.linear_q(s)
kv = self.linear_kv(s)
# [*, N_res, H, C_hidden]
q = q.view(q.shape[:-1] + (self.no_heads, -1))
# [*, N_res, H, 2 * C_hidden]
kv = kv.view(kv.shape[:-1] + (self.no_heads, -1))
# [*, N_res, H, C_hidden]
k, v = torch.split(kv, self.c_hidden, dim=-1)
# [*, N_res, H * P_q * 3]
q_pts = self.linear_q_points(s)
# This is kind of clunky, but it's how the original does it
# [*, N_res, H * P_q, 3]
q_pts = torch.split(q_pts, q_pts.shape[-1] // 3, dim=-1)
q_pts = torch.stack(q_pts, dim=-1)
q_pts = r[..., None].apply(q_pts)
# [*, N_res, H, P_q, 3]
q_pts = q_pts.view(q_pts.shape[:-2] + (self.no_heads, self.no_qk_points, 3))
# [*, N_res, H * (P_q + P_v) * 3]
kv_pts = self.linear_kv_points(s)
# [*, N_res, H * (P_q + P_v), 3]
kv_pts = torch.split(kv_pts, kv_pts.shape[-1] // 3, dim=-1)
kv_pts = torch.stack(kv_pts, dim=-1)
kv_pts = r[..., None].apply(kv_pts)
# [*, N_res, H, (P_q + P_v), 3]
kv_pts = kv_pts.view(kv_pts.shape[:-2] + (self.no_heads, -1, 3))
# [*, N_res, H, P_q/P_v, 3]
k_pts, v_pts = torch.split(
kv_pts, [self.no_qk_points, self.no_v_points], dim=-2
)
##########################
# Compute attention scores
##########################
# [*, N_res, N_res, H]
b = self.linear_b(z)
# [*, H, N_res, N_res]
a = torch.matmul(
permute_final_dims(q, (1, 0, 2)), # [*, H, N_res, C_hidden]
permute_final_dims(k, (1, 2, 0)), # [*, H, C_hidden, N_res]
)
a *= math.sqrt(1.0 / (3 * self.c_hidden))
a += math.sqrt(1.0 / 3) * permute_final_dims(b, (2, 0, 1))
if self.is_multimer:
# [*, N_res, N_res, H, P_q, 3]
pt_att = q_pts[..., None, :, :] - k_pts[..., None, :, :, :]
# [*, N_res, N_res, H, P_q]
pt_att = sum([c**2 for c in pt_att])
else:
# [*, N_res, N_res, H, P_q, 3]
######################################
q_pts_t0 = q_pts.unsqueeze(-4)
q_shape = q_pts_t0.shape
q_pts_t0 = q_pts_t0.reshape([q_shape[0], q_shape[1], -1])
k_pts_t0 = k_pts.unsqueeze(-5)
k_shape = k_pts_t0.shape
k_pts_t0 = k_pts_t0.reshape([k_shape[0], k_shape[1], -1])
q_k = q_pts_t0 - k_pts_t0
q_k = q_k ** 2
q_k_shape = q_k.shape
pt_att = q_k.reshape(q_k_shape[:2] + q_shape[-3:])
#####################################
pt_att = pt_att.permute(0, 4, 1, 2, 3)
pt_att = torch.sum(pt_att, 1)
head_weights = self.softplus(self.head_weights).view(
*((1,) * len(pt_att.shape[:-2]) + (-1, 1))
)
head_weights = head_weights * math.sqrt(
1.0 / (3 * (self.no_qk_points * 9.0 / 2))
)
##############################
pt_att_t0 = pt_att.permute(0, 3, 1, 2)
head_weights_t0 = head_weights.permute(0, 3, 1, 2)
pt_att_o = pt_att_t0 * head_weights_t0
            pt_att = pt_att_o.permute(0, 2, 3, 1)
##############################
# [*, N_res, N_res, H]
pt_att = torch.sum(pt_att, dim=-1) * (-0.5)
# [*, N_res, N_res]
square_mask = mask.unsqueeze(-1) * mask.unsqueeze(-2)
square_mask = self.inf * (square_mask - 1)
# [*, H, N_res, N_res]
pt_att = permute_final_dims(pt_att, (2, 0, 1))
a = a + pt_att
a = a + square_mask.unsqueeze(-3)
a = self.softmax(a)
################
# Compute output
################
# [*, N_res, H, C_hidden]
o = torch.matmul(a, v.transpose(-2, -3).to(dtype=a.dtype)).transpose(-2, -3)
# [*, N_res, H * C_hidden]
o = flatten_final_dims(o, 2)
# As DeepMind explains, this manual matmul ensures that the operation
# happens in float32.
if self.is_multimer:
# [*, N_res, H, P_v]
o_pt = v_pts * permute_final_dims(a, (1, 2, 0)).unsqueeze(-1)
o_pt = o_pt.sum(dim=-3)
# [*, N_res, H, P_v]
o_pt = r[..., None, None].apply_inverse_to_point(o_pt)
# [*, N_res, H * P_v, 3]
o_pt = o_pt.reshape(o_pt.shape[:-2] + (-1,))
# [*, N_res, H * P_v]
o_pt_norm = o_pt.norm(self.eps)
else:
# [*, H, 3, N_res, P_v]
###################################
a1 = a[..., None, :, :, None]
a1 = a1.permute(0, 1, 2, 4, 3)
b = permute_final_dims(v_pts, (1, 3, 0, 2))[..., None, :, :]
b = b.permute(0, 1, 2, 4, 3)
c = a1 * b
o_pt = torch.sum(c, -1)
###################################
# [*, N_res, H, P_v, 3]
o_pt = permute_final_dims(o_pt, (2, 0, 3, 1))
o_pt = r[..., None, None].invert_apply(o_pt)
# [*, N_res, H * P_v]
o_pt_norm = flatten_final_dims(
torch.sqrt(torch.sum(o_pt**2, dim=-1) + self.eps), 2
)
# [*, N_res, H * P_v, 3]
o_pt = o_pt.reshape(*o_pt.shape[:-3], -1, 3)
# [*, N_res, H, C_z]
o_pair = torch.matmul(a.transpose(-2, -3), z.to(dtype=a.dtype))
# [*, N_res, H * C_z]
o_pair = flatten_final_dims(o_pair, 2)
# [*, N_res, C_s]
if self.is_multimer:
s = self.linear_out(
torch.cat((o, *o_pt, o_pt_norm, o_pair), dim=-1).to(dtype=z.dtype)
)
else:
s = self.linear_out(
torch.cat(
(o, *torch.unbind(o_pt, dim=-1), o_pt_norm, o_pair), dim=-1
).to(dtype=z.dtype)
)
return s
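# Usage sketch added for illustration: the hyperparameters below roughly follow
# AlphaFold-style defaults and are illustrative, not read from the repository's
# config. The inputs are unbatched because the reshapes in the non-multimer
# branch above assume no extra leading batch dimension.
def _example_invariant_point_attention():
    n_res = 16
    ipa = InvariantPointAttention(
        c_s=384, c_z=128, c_hidden=16, no_heads=12, no_qk_points=4, no_v_points=8
    )
    s = torch.randn(n_res, 384)
    z = torch.randn(n_res, n_res, 128)
    rigids = Rigid.identity((n_res,), s.dtype, s.device, requires_grad=False, fmt="quat")
    mask = s.new_ones(n_res)
    update = ipa(s, z, rigids, mask)
    # The module returns an update to the single representation
    assert update.shape == (n_res, 384)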
class BackboneUpdate(nn.Module):
"""
Implements part of Algorithm 23.
"""
def __init__(self, c_s: int):
"""
Args:
c_s:
Single representation channel dimension
"""
super(BackboneUpdate, self).__init__()
self.c_s = c_s
self.linear = Linear(self.c_s, 6, init="final")
    def forward(self, s: torch.Tensor) -> torch.Tensor:
        """
        Args:
            s:
                [*, N_res, C_s] single representation
        Returns:
            [*, N_res, 6] update vector
        """
        # [*, N_res, 6]
update = self.linear(s)
return update
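# Usage sketch added for illustration (sizes are illustrative): the
# 6-dimensional update is consumed by Rigid.compose_q_update_vec in the
# structure module loop below.
def _example_backbone_update():
    bb_update = BackboneUpdate(c_s=384)
    s = torch.randn(10, 384)
    update = bb_update(s)
    assert update.shape == (10, 6)
    rigids = Rigid.identity((10,), s.dtype, s.device, requires_grad=False, fmt="quat")
    rigids = rigids.compose_q_update_vec(update)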
class StructureModuleTransitionLayer(nn.Module):
def __init__(self, c: int):
super(StructureModuleTransitionLayer, self).__init__()
self.c = c
self.linear_1 = Linear(self.c, self.c, init="relu")
self.linear_2 = Linear(self.c, self.c, init="relu")
self.linear_3 = Linear(self.c, self.c, init="final")
self.relu = nn.ReLU()
def forward(self, s: torch.Tensor):
s_initial = s
s = self.linear_1(s)
s = self.relu(s)
s = self.linear_2(s)
s = self.relu(s)
s = self.linear_3(s)
s = s + s_initial
return s
class StructureModuleTransition(nn.Module):
def __init__(self, c: int, num_layers: int, dropout_rate: float):
super(StructureModuleTransition, self).__init__()
self.c = c
self.num_layers = num_layers
self.dropout_rate = dropout_rate
self.layers = nn.ModuleList()
for _ in range(self.num_layers):
l = StructureModuleTransitionLayer(self.c)
self.layers.append(l)
self.dropout = nn.Dropout(self.dropout_rate)
self.layer_norm = LayerNorm(self.c)
def forward(self, s: torch.Tensor) -> torch.Tensor:
for l in self.layers:
s = l(s)
s = self.dropout(s)
s = self.layer_norm(s)
return s
class StructureModule(nn.Module):
def __init__(
self,
c_s: int,
c_z: int,
c_ipa: int,
c_resnet: int,
no_heads_ipa: int,
no_qk_points: int,
no_v_points: int,
dropout_rate: float,
no_blocks: int,
no_transition_layers: int,
no_resnet_blocks: int,
no_angles: int,
trans_scale_factor: float,
epsilon: float,
inf: float,
is_multimer: bool = False,
**kwargs,
):
"""
Args:
c_s:
Single representation channel dimension
c_z:
Pair representation channel dimension
c_ipa:
IPA hidden channel dimension
c_resnet:
Angle resnet (Alg. 23 lines 11-14) hidden channel dimension
no_heads_ipa:
Number of IPA heads
no_qk_points:
Number of query/key points to generate during IPA
no_v_points:
Number of value points to generate during IPA
dropout_rate:
Dropout rate used throughout the layer
no_blocks:
Number of structure module blocks
no_transition_layers:
Number of layers in the single representation transition
(Alg. 23 lines 8-9)
no_resnet_blocks:
Number of blocks in the angle resnet
no_angles:
Number of angles to generate in the angle resnet
trans_scale_factor:
Scale of single representation transition hidden dimension
epsilon:
Small number used in angle resnet normalization
inf:
Large number used for attention masking
is_multimer:
whether running under multimer mode
"""
super(StructureModule, self).__init__()
self.c_s = c_s
self.c_z = c_z
self.c_ipa = c_ipa
self.c_resnet = c_resnet
self.no_heads_ipa = no_heads_ipa
self.no_qk_points = no_qk_points
self.no_v_points = no_v_points
self.dropout_rate = dropout_rate
self.no_blocks = no_blocks
self.no_transition_layers = no_transition_layers
self.no_resnet_blocks = no_resnet_blocks
self.no_angles = no_angles
self.trans_scale_factor = trans_scale_factor
self.epsilon = epsilon
self.inf = inf
self.is_multimer = is_multimer
# To be lazily initialized later
self.default_frames = None
self.group_idx = None
self.atom_mask = None
self.lit_positions = None
self.layer_norm_s = LayerNorm(self.c_s)
self.layer_norm_z = LayerNorm(self.c_z)
self.linear_in = Linear(self.c_s, self.c_s)
self.ipa = InvariantPointAttention(
self.c_s,
self.c_z,
self.c_ipa,
self.no_heads_ipa,
self.no_qk_points,
self.no_v_points,
inf=self.inf,
eps=self.epsilon,
is_multimer=self.is_multimer,
)
self.ipa_dropout = nn.Dropout(self.dropout_rate)
self.layer_norm_ipa = LayerNorm(self.c_s)
self.transition = StructureModuleTransition(
self.c_s,
self.no_transition_layers,
self.dropout_rate,
)
if is_multimer:
self.bb_update = QuatRigid(self.c_s, full_quat=False)
else:
self.bb_update = BackboneUpdate(self.c_s)
self.angle_resnet = AngleResnet(
self.c_s,
self.c_resnet,
self.no_resnet_blocks,
self.no_angles,
self.epsilon,
)
def _forward_monomer(
self,
s: torch.Tensor,
z: torch.Tensor,
aatype: torch.Tensor,
mask: Optional[torch.Tensor] = None,
) -> Dict[str, Any]:
"""
Args:
s:
[*, N_res, C_s] single representation
z:
[*, N_res, N_res, C_z] pair representation
aatype:
[*, N_res] amino acid indices
mask:
Optional [*, N_res] sequence mask
Returns:
A dictionary of outputs
"""
if mask is None:
# [*, N]
mask = s.new_ones(s.shape[:-1])
# [*, N, C_s]
s = self.layer_norm_s(s)
# [*, N, N, C_z]
z = self.layer_norm_z(z)
# [*, N, C_s]
s_initial = s
s = self.linear_in(s)
# [*, N]
rigids = Rigid.identity(
s.shape[:-1],
s.dtype,
s.device,
self.training,
fmt="quat",
)
outputs = []
for i in range(self.no_blocks):
# [*, N, C_s]
s = s + self.ipa(s, z, rigids, mask)
s = self.ipa_dropout(s)
s = self.layer_norm_ipa(s)
s = self.transition(s)
# [*, N]
rigids = rigids.compose_q_update_vec(self.bb_update(s))
# To hew as closely as possible to AlphaFold, we convert our
# quaternion-based transformations to rotation-matrix ones
# here
backb_to_global = Rigid(
Rotation(
rot_mats=rigids.get_rots().get_rot_mats(),
quats=None
),
rigids.get_trans(),
)
backb_to_global = backb_to_global.scale_translation(
self.trans_scale_factor
)
# [*, N, 7, 2]
unnormalized_angles, angles = self.angle_resnet(s, s_initial)
all_frames_to_global = self.torsion_angles_to_frames(
backb_to_global,
angles,
aatype,
)
pred_xyz = self.frames_and_literature_positions_to_atom14_pos(
all_frames_to_global,
aatype,
)
scaled_rigids = rigids.scale_translation(self.trans_scale_factor)
preds = {
"frames": scaled_rigids.to_tensor_7(),
"sidechain_frames": all_frames_to_global.to_tensor_4x4(),
"unnormalized_angles": unnormalized_angles,
"angles": angles,
"positions": pred_xyz,
}
outputs.append(preds)
if i < (self.no_blocks - 1):
rigids = rigids.stop_rot_gradient()
if habana.is_habana():
import habana_frameworks.torch.core as htcore
htcore.mark_step()
outputs = dict_multimap(torch.stack, outputs)
outputs["single"] = s
return outputs
def _forward_multimer(
self,
s: torch.Tensor,
z: torch.Tensor,
aatype: torch.Tensor,
mask: Optional[torch.Tensor] = None,
) -> Dict[str, Any]:
if mask is None:
# [*, N]
mask = s.new_ones(s.shape[:-1])
# [*, N, C_s]
s = self.layer_norm_s(s)
# [*, N, N, C_z]
z = self.layer_norm_z(z)
# [*, N, C_s]
s_initial = s
s = self.linear_in(s)
# [*, N]
rigids = Rigid3Array.identity(
s.shape[:-1],
s.device,
)
outputs = []
for i in range(self.no_blocks):
# [*, N, C_s]
s = s + self.ipa(s, z, rigids, mask)
s = self.ipa_dropout(s)
s = self.layer_norm_ipa(s)
s = self.transition(s)
# [*, N]
rigids = rigids @ self.bb_update(s)
# [*, N, 7, 2]
unnormalized_angles, angles = self.angle_resnet(s, s_initial)
all_frames_to_global = self.torsion_angles_to_frames(
rigids.scale_translation(self.trans_scale_factor),
angles,
aatype,
)
pred_xyz = self.frames_and_literature_positions_to_atom14_pos(
all_frames_to_global,
aatype,
)
preds = {
"frames": rigids.scale_translation(self.trans_scale_factor).to_tensor(),
"sidechain_frames": all_frames_to_global.to_tensor_4x4(),
"unnormalized_angles": unnormalized_angles,
"angles": angles,
"positions": pred_xyz.to_tensor(),
}
outputs.append(preds)
if i < (self.no_blocks - 1):
rigids = rigids.stop_rot_gradient()
outputs = dict_multimap(torch.stack, outputs)
outputs["single"] = s
return outputs
def forward(
self,
s: torch.Tensor,
z: torch.Tensor,
aatype: torch.Tensor,
mask: Optional[torch.Tensor] = None,
):
"""
Args:
s:
[*, N_res, C_s] single representation
z:
[*, N_res, N_res, C_z] pair representation
aatype:
[*, N_res] amino acid indices
mask:
Optional [*, N_res] sequence mask
Returns:
A dictionary of outputs
"""
if self.is_multimer:
outputs = self._forward_multimer(s, z, aatype, mask)
else:
outputs = self._forward_monomer(s, z, aatype, mask)
return outputs
def _init_residue_constants(self, float_dtype: torch.dtype, device: torch.device):
if self.default_frames is None:
self.default_frames = torch.tensor(
restype_rigid_group_default_frame,
dtype=float_dtype,
device=device,
requires_grad=False,
)
if self.group_idx is None:
self.group_idx = torch.tensor(
restype_atom14_to_rigid_group,
device=device,
requires_grad=False,
)
if self.atom_mask is None:
self.atom_mask = torch.tensor(
restype_atom14_mask,
dtype=float_dtype,
device=device,
requires_grad=False,
)
if self.lit_positions is None:
self.lit_positions = torch.tensor(
restype_atom14_rigid_group_positions,
dtype=float_dtype,
device=device,
requires_grad=False,
)
def torsion_angles_to_frames(
self, r: Union[Rigid, Rigid3Array], alpha: torch.Tensor, f
):
# Lazily initialize the residue constants on the correct device
self._init_residue_constants(alpha.dtype, alpha.device)
# Separated purely to make testing less annoying
return torsion_angles_to_frames(r, alpha, f, self.default_frames)
def frames_and_literature_positions_to_atom14_pos(
self, r: Union[Rigid, Rigid3Array], f # [*, N, 8] # [*, N]
):
# Lazily initialize the residue constants on the correct device
if type(r) == Rigid:
self._init_residue_constants(r.get_rots().dtype, r.get_rots().device)
elif type(r) == Rigid3Array:
self._init_residue_constants(r.dtype, r.device)
else:
raise ValueError("Unknown rigid type")
return frames_and_literature_positions_to_atom14_pos(
r,
f,
self.default_frames,
self.group_idx,
self.atom_mask,
self.lit_positions,
)
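# Usage sketch added for illustration: the hyperparameters are illustrative
# (roughly AlphaFold-like), not read from the repository's config. Inputs are
# unbatched for the same reason as in the IPA sketch above; aatype index 0
# corresponds to alanine in the residue-constant ordering.
def _example_structure_module():
    n_res = 16
    sm = StructureModule(
        c_s=384, c_z=128, c_ipa=16, c_resnet=128,
        no_heads_ipa=12, no_qk_points=4, no_v_points=8,
        dropout_rate=0.1, no_blocks=2, no_transition_layers=1,
        no_resnet_blocks=2, no_angles=7, trans_scale_factor=10.0,
        epsilon=1e-8, inf=1e5,
    )
    sm.eval()
    s = torch.randn(n_res, 384)
    z = torch.randn(n_res, n_res, 128)
    aatype = torch.zeros(n_res, dtype=torch.long)
    outputs = sm(s, z, aatype)
    # Predictions are stacked per block, e.g. atom14 coordinates:
    assert outputs["positions"].shape == (2, n_res, 14, 3)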
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
import math
from typing import Optional, List
import torch
import torch.nn as nn
from fastfold.model.nn.primitives import Linear, LayerNorm, Attention
from fastfold.model.nn.dropout import (
DropoutRowwise,
DropoutColumnwise,
)
from fastfold.model.nn.pair_transition import PairTransition
from fastfold.model.nn.triangular_attention import (
TriangleAttentionStartingNode,
TriangleAttentionEndingNode,
)
from fastfold.model.nn.triangular_multiplicative_update import (
TriangleMultiplicationOutgoing,
TriangleMultiplicationIncoming,
)
from fastfold.utils.checkpointing import checkpoint_blocks
from fastfold.utils.tensor_utils import (
chunk_layer,
permute_final_dims,
flatten_final_dims,
)
import fastfold.habana as habana
class TemplatePointwiseAttention(nn.Module):
"""
Implements Algorithm 17.
"""
def __init__(self, c_t, c_z, c_hidden, no_heads, inf, **kwargs):
"""
Args:
c_t:
Template embedding channel dimension
c_z:
Pair embedding channel dimension
c_hidden:
Hidden channel dimension
"""
super(TemplatePointwiseAttention, self).__init__()
self.c_t = c_t
self.c_z = c_z
self.c_hidden = c_hidden
self.no_heads = no_heads
self.inf = inf
self.mha = Attention(
self.c_z,
self.c_t,
self.c_t,
self.c_hidden,
self.no_heads,
gating=False,
)
def _chunk(self,
z: torch.Tensor,
t: torch.Tensor,
biases: List[torch.Tensor],
chunk_size: int,
) -> torch.Tensor:
mha_inputs = {
"q_x": z,
"kv_x": t,
"biases": biases,
}
return chunk_layer(
self.mha,
mha_inputs,
chunk_size=chunk_size,
no_batch_dims=len(z.shape[:-2]),
)
def forward(self,
t: torch.Tensor,
z: torch.Tensor,
template_mask: Optional[torch.Tensor] = None,
chunk_size: Optional[int] = None
) -> torch.Tensor:
"""
Args:
t:
[*, N_templ, N_res, N_res, C_t] template embedding
z:
                [*, N_res, N_res, C_z] pair embedding
template_mask:
[*, N_templ] template mask
Returns:
[*, N_res, N_res, C_z] pair embedding update
"""
if template_mask is None:
template_mask = t.new_ones(t.shape[:-3])
bias = self.inf * (template_mask[..., None, None, None, None, :] - 1)
# [*, N_res, N_res, 1, C_z]
z = z.unsqueeze(-2)
# [*, N_res, N_res, N_temp, C_t]
t = permute_final_dims(t, (1, 2, 0, 3))
biases = [bias]
if habana.is_habana():
z = self.mha(q_x=z, kv_x=t, biases=biases)
else:
if chunk_size is not None:
z = self._chunk(z, t, biases, chunk_size)
else:
z = self.mha(q_x=z, kv_x=t, biases=biases)
# [*, N_res, N_res, C_z]
z = z.squeeze(-2)
return z
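# Usage sketch added for illustration (sizes are illustrative): attends from
# each pair position of z over the stacked template embeddings.
def _example_template_pointwise_attention():
    tpa = TemplatePointwiseAttention(c_t=64, c_z=128, c_hidden=16, no_heads=4, inf=1e9)
    t = torch.randn(2, 4, 16, 16, 64)    # [batch, N_templ, N_res, N_res, C_t]
    z = torch.randn(2, 16, 16, 128)      # [batch, N_res, N_res, C_z]
    update = tpa(t, z)
    assert update.shape == z.shape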
class TemplatePairStackBlock(nn.Module):
def __init__(
self,
c_t: int,
c_hidden_tri_att: int,
c_hidden_tri_mul: int,
no_heads: int,
pair_transition_n: int,
dropout_rate: float,
inf: float,
is_multimer: bool=False,
**kwargs,
):
super(TemplatePairStackBlock, self).__init__()
self.c_t = c_t
self.c_hidden_tri_att = c_hidden_tri_att
self.c_hidden_tri_mul = c_hidden_tri_mul
self.no_heads = no_heads
self.pair_transition_n = pair_transition_n
self.dropout_rate = dropout_rate
self.inf = inf
self.is_multimer = is_multimer
self.dropout_row = DropoutRowwise(self.dropout_rate)
self.dropout_col = DropoutColumnwise(self.dropout_rate)
self.tri_att_start = TriangleAttentionStartingNode(
self.c_t,
self.c_hidden_tri_att,
self.no_heads,
inf=inf,
)
self.tri_att_end = TriangleAttentionEndingNode(
self.c_t,
self.c_hidden_tri_att,
self.no_heads,
inf=inf,
)
self.tri_mul_out = TriangleMultiplicationOutgoing(
self.c_t,
self.c_hidden_tri_mul,
)
self.tri_mul_in = TriangleMultiplicationIncoming(
self.c_t,
self.c_hidden_tri_mul,
)
self.pair_transition = PairTransition(
self.c_t,
self.pair_transition_n,
)
def forward(self,
z: torch.Tensor,
mask: torch.Tensor,
chunk_size: Optional[int] = None,
_mask_trans: bool = True
):
single_templates = [
t.unsqueeze(-4) for t in torch.unbind(z, dim=-4)
]
single_templates_masks = [
m.unsqueeze(-3) for m in torch.unbind(mask, dim=-3)
]
if not self.is_multimer:
for i in range(len(single_templates)):
single = single_templates[i]
single_mask = single_templates_masks[i]
single = single + self.dropout_row(
self.tri_att_start(
single,
chunk_size=chunk_size,
mask=single_mask
)
)
single = single + self.dropout_col(
self.tri_att_end(
single,
chunk_size=chunk_size,
mask=single_mask
)
)
single = single + self.dropout_row(
self.tri_mul_out(
single,
mask=single_mask
)
)
single = single + self.dropout_row(
self.tri_mul_in(
single,
mask=single_mask
)
)
single = single + self.pair_transition(
single,
mask=single_mask if _mask_trans else None,
chunk_size=chunk_size,
)
single_templates[i] = single
else:
for i in range(len(single_templates)):
single = single_templates[i]
single_mask = single_templates_masks[i]
single = single + self.dropout_row(
self.tri_att_start(single, chunk_size=chunk_size, mask=single_mask)
)
single = single + self.dropout_col(
self.tri_att_end(single, chunk_size=chunk_size, mask=single_mask)
)
single = single + self.dropout_row(
self.tri_mul_out(single, mask=single_mask)
)
single = single + self.dropout_row(
self.tri_mul_in(single, mask=single_mask)
)
single = single + self.pair_transition(
single,
mask=single_mask if _mask_trans else None,
chunk_size=chunk_size,
)
single_templates[i] = single
z = torch.cat(single_templates, dim=-4)
return z
class TemplatePairStack(nn.Module):
"""
Implements Algorithm 16.
"""
def __init__(
self,
c_t,
c_hidden_tri_att,
c_hidden_tri_mul,
no_blocks,
no_heads,
pair_transition_n,
dropout_rate,
blocks_per_ckpt,
inf=1e9,
**kwargs,
):
"""
Args:
c_t:
Template embedding channel dimension
c_hidden_tri_att:
Per-head hidden dimension for triangular attention
            c_hidden_tri_mul:
                Hidden dimension for triangular multiplication
            no_blocks:
                Number of blocks in the stack
            no_heads:
                Number of triangular attention heads
pair_transition_n:
Scale of pair transition (Alg. 15) hidden dimension
dropout_rate:
Dropout rate used throughout the stack
blocks_per_ckpt:
Number of blocks per activation checkpoint. None disables
activation checkpointing
"""
super(TemplatePairStack, self).__init__()
self.blocks_per_ckpt = blocks_per_ckpt
self.blocks = nn.ModuleList()
for _ in range(no_blocks):
block = TemplatePairStackBlock(
c_t=c_t,
c_hidden_tri_att=c_hidden_tri_att,
c_hidden_tri_mul=c_hidden_tri_mul,
no_heads=no_heads,
pair_transition_n=pair_transition_n,
dropout_rate=dropout_rate,
inf=inf,
)
self.blocks.append(block)
self.layer_norm = LayerNorm(c_t)
def forward(
self,
        t: torch.Tensor,
        mask: torch.Tensor,
chunk_size: int,
_mask_trans: bool = True,
):
"""
Args:
t:
[*, N_templ, N_res, N_res, C_t] template embedding
mask:
[*, N_templ, N_res, N_res] mask
Returns:
[*, N_templ, N_res, N_res, C_t] template embedding update
"""
        if mask.shape[-3] == 1:
expand_idx = list(mask.shape)
expand_idx[-3] = t.shape[-4]
mask = mask.expand(*expand_idx)
t, = checkpoint_blocks(
blocks=[
partial(
b,
mask=mask,
chunk_size=chunk_size,
_mask_trans=_mask_trans,
)
for b in self.blocks
],
args=(t,),
blocks_per_ckpt=self.blocks_per_ckpt if self.training else None,
)
t = self.layer_norm(t)
return t
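# Usage sketch added for illustration (sizes are illustrative): runs the
# template pair stack in eval mode, so activation checkpointing is disabled.
def _example_template_pair_stack():
    stack = TemplatePairStack(
        c_t=64, c_hidden_tri_att=16, c_hidden_tri_mul=64,
        no_blocks=2, no_heads=4, pair_transition_n=2,
        dropout_rate=0.25, blocks_per_ckpt=None,
    )
    stack.eval()
    t = torch.randn(1, 2, 8, 8, 64)   # [batch, N_templ, N_res, N_res, C_t]
    mask = torch.ones(1, 2, 8, 8)     # [batch, N_templ, N_res, N_res]
    out = stack(t, mask, chunk_size=None)
    assert out.shape == t.shape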
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partialmethod, partial
import math
from typing import Optional, List
import torch
import torch.nn as nn
from fastfold.model.nn.primitives import Linear, LayerNorm, Attention
from fastfold.utils.tensor_utils import (
chunk_layer,
permute_final_dims,
flatten_final_dims,
)
class TriangleAttention(nn.Module):
def __init__(
self, c_in, c_hidden, no_heads, starting, inf=1e9
):
"""
Args:
c_in:
Input channel dimension
c_hidden:
Overall hidden channel dimension (not per-head)
no_heads:
Number of attention heads
"""
super(TriangleAttention, self).__init__()
self.c_in = c_in
self.c_hidden = c_hidden
self.no_heads = no_heads
self.starting = starting
self.inf = inf
self.layer_norm = LayerNorm(self.c_in)
self.linear = Linear(c_in, self.no_heads, bias=False, init="normal")
self.mha = Attention(
self.c_in, self.c_in, self.c_in, self.c_hidden, self.no_heads
)
@torch.jit.ignore
def _chunk(self,
x: torch.Tensor,
biases: List[torch.Tensor],
chunk_size: int,
) -> torch.Tensor:
mha_inputs = {
"q_x": x,
"kv_x": x,
"biases": biases,
}
return chunk_layer(
partial(self.mha),
mha_inputs,
chunk_size=chunk_size,
no_batch_dims=len(x.shape[:-2]),
)
def forward(self,
x: torch.Tensor,
mask: Optional[torch.Tensor] = None,
chunk_size: Optional[int] = None
) -> torch.Tensor:
"""
Args:
x:
[*, I, J, C_in] input tensor (e.g. the pair representation)
Returns:
[*, I, J, C_in] output tensor
"""
if mask is None:
# [*, I, J]
mask = x.new_ones(
x.shape[:-1],
)
# Shape annotations assume self.starting. Else, I and J are flipped
if not self.starting:
x = x.transpose(-2, -3)
mask = mask.transpose(-1, -2)
# [*, I, J, C_in]
x = self.layer_norm(x)
# [*, I, 1, 1, J]
mask_bias = (self.inf * (mask - 1))[..., :, None, None, :]
# [*, H, I, J]
triangle_bias = permute_final_dims(self.linear(x), (2, 0, 1))
# [*, 1, H, I, J]
triangle_bias = triangle_bias.unsqueeze(-4)
biases = [mask_bias, triangle_bias]
if chunk_size is not None:
x = self._chunk(x, biases, chunk_size)
else:
x = self.mha(q_x=x, kv_x=x, biases=biases)
if not self.starting:
x = x.transpose(-2, -3)
return x
class TriangleAttentionStartingNode(TriangleAttention):
"""
Implements Algorithm 13.
"""
__init__ = partialmethod(TriangleAttention.__init__, starting=True)
class TriangleAttentionEndingNode(TriangleAttention):
"""
Implements Algorithm 14.
"""
__init__ = partialmethod(TriangleAttention.__init__, starting=False)
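# Usage sketch added for illustration (sizes are illustrative): triangular
# self-attention around the starting node over a pair representation.
def _example_triangle_attention():
    tri_att = TriangleAttentionStartingNode(c_in=128, c_hidden=32, no_heads=4)
    x = torch.randn(1, 16, 16, 128)
    out = tri_att(x)  # mask defaults to all-ones
    assert out.shape == x.shape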
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partialmethod
from typing import Optional
import torch
import torch.nn as nn
from fastfold.model.nn.primitives import Linear, LayerNorm
from fastfold.utils.tensor_utils import permute_final_dims
_FUSED_TRIANGLE_MULTIPLICATION = False
def set_fused_triangle_multiplication():
global _FUSED_TRIANGLE_MULTIPLICATION
_FUSED_TRIANGLE_MULTIPLICATION = True
def is_fused_triangle_multiplication():
global _FUSED_TRIANGLE_MULTIPLICATION
return _FUSED_TRIANGLE_MULTIPLICATION
class TriangleMultiplicativeUpdate(nn.Module):
"""
Implements Algorithms 11 and 12.
"""
def __init__(self, c_z, c_hidden, _outgoing=True):
"""
Args:
c_z:
Input channel dimension
            c_hidden:
                Hidden channel dimension
"""
super(TriangleMultiplicativeUpdate, self).__init__()
self.c_z = c_z
self.c_hidden = c_hidden
self._outgoing = _outgoing
if _FUSED_TRIANGLE_MULTIPLICATION:
self.linear_p = Linear(self.c_z, 2 * self.c_hidden)
self.linear_g = Linear(self.c_z, 2 * self.c_hidden, init="gating")
self.linear_gate = Linear(self.c_z, self.c_z, init="gating")
else:
self.linear_a_p = Linear(self.c_z, self.c_hidden)
self.linear_a_g = Linear(self.c_z, self.c_hidden, init="gating")
self.linear_b_p = Linear(self.c_z, self.c_hidden)
self.linear_b_g = Linear(self.c_z, self.c_hidden, init="gating")
self.linear_g = Linear(self.c_z, self.c_z, init="gating")
self.linear_z = Linear(self.c_hidden, self.c_z, init="final")
self.layer_norm_in = LayerNorm(self.c_z)
self.layer_norm_out = LayerNorm(self.c_hidden)
self.sigmoid = nn.Sigmoid()
def _combine_projections(self,
a: torch.Tensor,
b: torch.Tensor,
) -> torch.Tensor:
raise NotImplementedError("This method needs to be overridden")
def forward(self,
z: torch.Tensor,
mask: Optional[torch.Tensor] = None
) -> torch.Tensor:
"""
Args:
            z:
[*, N_res, N_res, C_z] input tensor
mask:
[*, N_res, N_res] input mask
Returns:
[*, N_res, N_res, C_z] output tensor
"""
if mask is None:
mask = z.new_ones(z.shape[:-1])
mask = mask.unsqueeze(-1)
z = self.layer_norm_in(z)
if _FUSED_TRIANGLE_MULTIPLICATION:
            a = self.linear_p(z) * mask
            a = a * self.sigmoid(self.linear_g(z))
            a, b = a.chunk(2, dim=-1)
else:
a = self.linear_a_p(z) * self.sigmoid(self.linear_a_g(z))
a = a * mask
b = self.linear_b_p(z) * self.sigmoid(self.linear_b_g(z))
b = b * mask
x = self._combine_projections(a, b)
x = self.layer_norm_out(x)
x = self.linear_z(x)
if _FUSED_TRIANGLE_MULTIPLICATION:
g = self.sigmoid(self.linear_gate(z))
else:
g = self.sigmoid(self.linear_g(z))
z = x * g
return z
class TriangleMultiplicationOutgoing(TriangleMultiplicativeUpdate):
"""
Implements Algorithm 11.
"""
def _combine_projections(self,
a: torch.Tensor, # [*, N_i, N_k, C]
b: torch.Tensor, # [*, N_j, N_k, C]
):
# [*, C, N_i, N_j]
p = torch.matmul(
permute_final_dims(a, (2, 0, 1)),
permute_final_dims(b, (2, 1, 0)),
)
# [*, N_i, N_j, C]
return permute_final_dims(p, (1, 2, 0))
class TriangleMultiplicationIncoming(TriangleMultiplicativeUpdate):
"""
Implements Algorithm 12.
"""
def _combine_projections(self,
a: torch.Tensor, # [*, N_k, N_i, C]
b: torch.Tensor, # [*, N_k, N_j, C]
):
# [*, C, N_i, N_j]
p = torch.matmul(
permute_final_dims(a, (2, 1, 0)),
permute_final_dims(b, (2, 0, 1)),
)
# [*, N_i, N_j, C]
return permute_final_dims(p, (1, 2, 0))
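# Usage sketch added for illustration (sizes are illustrative). Note that the
# fused projection layout is controlled by a module-level flag:
# set_fused_triangle_multiplication() must be called before constructing these
# modules for the fused projection weights to be created.
def _example_triangle_multiplication():
    tri_mul = TriangleMultiplicationOutgoing(c_z=128, c_hidden=128)
    z = torch.randn(1, 16, 16, 128)
    out = tri_mul(z)  # mask defaults to all-ones
    assert out.shape == z.shape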
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Restrained Amber Minimization of a structure."""
import io
import time
from typing import Collection, Optional, Sequence, Dict
from absl import logging
from fastfold.common import (
protein,
residue_constants,
)
import fastfold.model.hub.loss as loss
from fastfold.relax import cleanup, utils
import ml_collections
import numpy as np
import openmm
from openmm import unit
from openmm import app as openmm_app
from openmm.app.internal.pdbstructure import PdbStructure
ENERGY = unit.kilocalories_per_mole
LENGTH = unit.angstroms
def will_restrain(atom: openmm_app.Atom, rset: str) -> bool:
"""Returns True if the atom will be restrained by the given restraint set."""
if rset == "non_hydrogen":
return atom.element.name != "hydrogen"
    elif rset == "c_alpha":
        return atom.name == "CA"
    else:
        raise ValueError(f"Unknown restraint set: {rset}")
def _add_restraints(
system: openmm.System,
reference_pdb: openmm_app.PDBFile,
stiffness: unit.Unit,
rset: str,
exclude_residues: Sequence[int],
):
"""Adds a harmonic potential that restrains the system to a structure."""
assert rset in ["non_hydrogen", "c_alpha"]
force = openmm.CustomExternalForce("0.5 * k * ((x-x0)^2 + (y-y0)^2 + (z-z0)^2)")
force.addGlobalParameter("k", stiffness)
for p in ["x0", "y0", "z0"]:
force.addPerParticleParameter(p)
for i, atom in enumerate(reference_pdb.topology.atoms()):
if atom.residue.index in exclude_residues:
continue
if will_restrain(atom, rset):
force.addParticle(i, reference_pdb.positions[i])
logging.info(
"Restraining %d / %d particles.",
force.getNumParticles(),
system.getNumParticles(),
)
system.addForce(force)
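# Usage sketch added for illustration: builds a restraint stiffness from the
# module's unit constants and attaches restraints to an existing system. The
# stiffness value is illustrative.
def _example_add_restraints(system: openmm.System, reference_pdb: openmm_app.PDBFile):
    stiffness = 10.0 * ENERGY / (LENGTH ** 2)
    _add_restraints(system, reference_pdb, stiffness, rset="non_hydrogen", exclude_residues=[])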
def _openmm_minimize(
pdb_str: str,
max_iterations: int,
tolerance: unit.Unit,
stiffness: unit.Unit,
restraint_set: str,
exclude_residues: Sequence[int],
use_gpu: bool,
):
"""Minimize energy via openmm."""
pdb_file = io.StringIO(pdb_str)
pdb = openmm_app.PDBFile(pdb_file)
force_field = openmm_app.ForceField("amber99sb.xml")
constraints = openmm_app.HBonds
system = force_field.createSystem(pdb.topology, constraints=constraints)
if stiffness > 0 * ENERGY / (LENGTH**2):
_add_restraints(system, pdb, stiffness, restraint_set, exclude_residues)
integrator = openmm.LangevinIntegrator(0, 0.01, 0.0)
platform = openmm.Platform.getPlatformByName("CUDA" if use_gpu else "CPU")
simulation = openmm_app.Simulation(pdb.topology, system, integrator, platform)
simulation.context.setPositions(pdb.positions)
ret = {}
state = simulation.context.getState(getEnergy=True, getPositions=True)
ret["einit"] = state.getPotentialEnergy().value_in_unit(ENERGY)
ret["posinit"] = state.getPositions(asNumpy=True).value_in_unit(LENGTH)
simulation.minimizeEnergy(maxIterations=max_iterations, tolerance=tolerance)
state = simulation.context.getState(getEnergy=True, getPositions=True)
ret["efinal"] = state.getPotentialEnergy().value_in_unit(ENERGY)
ret["pos"] = state.getPositions(asNumpy=True).value_in_unit(LENGTH)
ret["min_pdb"] = _get_pdb_string(simulation.topology, state.getPositions())
return ret
def _get_pdb_string(topology: openmm_app.Topology, positions: unit.Quantity):
"""Returns a pdb string provided OpenMM topology and positions."""
with io.StringIO() as f:
openmm_app.PDBFile.writeFile(topology, positions, f)
return f.getvalue()
def _check_cleaned_atoms(pdb_cleaned_string: str, pdb_ref_string: str):
"""Checks that no atom positions have been altered by cleaning."""
cleaned = openmm_app.PDBFile(io.StringIO(pdb_cleaned_string))
reference = openmm_app.PDBFile(io.StringIO(pdb_ref_string))
cl_xyz = np.array(cleaned.getPositions().value_in_unit(LENGTH))
ref_xyz = np.array(reference.getPositions().value_in_unit(LENGTH))
for ref_res, cl_res in zip(reference.topology.residues(), cleaned.topology.residues()):
assert ref_res.name == cl_res.name
for rat in ref_res.atoms():
for cat in cl_res.atoms():
if cat.name == rat.name:
if not np.array_equal(cl_xyz[cat.index], ref_xyz[rat.index]):
raise ValueError(f"Coordinates of cleaned atom {cat} do not match "
f"coordinates of reference atom {rat}.")
def _check_residues_are_well_defined(prot: protein.Protein):
"""Checks that all residues contain non-empty atom sets."""
if (prot.atom_mask.sum(axis=-1) == 0).any():
raise ValueError("Amber minimization can only be performed on proteins with"
" well-defined residues. This protein contains at least"
" one residue with no atoms.")
def _check_atom_mask_is_ideal(prot):
"""Sanity-check the atom mask is ideal, up to a possible OXT."""
atom_mask = prot.atom_mask
ideal_atom_mask = protein.ideal_atom_mask(prot)
utils.assert_equal_nonterminal_atom_types(atom_mask, ideal_atom_mask)
def clean_protein(prot: protein.Protein, checks: bool = True):
"""Adds missing atoms to Protein instance.
Args:
prot: A `protein.Protein` instance.
checks: A `bool` specifying whether to add additional checks to the cleaning
process.
Returns:
pdb_string: A string of the cleaned protein.
"""
_check_atom_mask_is_ideal(prot)
# Clean pdb.
prot_pdb_string = protein.to_pdb(prot)
pdb_file = io.StringIO(prot_pdb_string)
alterations_info = {}
fixed_pdb = cleanup.fix_pdb(pdb_file, alterations_info)
fixed_pdb_file = io.StringIO(fixed_pdb)
pdb_structure = PdbStructure(fixed_pdb_file)
cleanup.clean_structure(pdb_structure, alterations_info)
logging.info("alterations info: %s", alterations_info)
# Write pdb file of cleaned structure.
as_file = openmm_app.PDBFile(pdb_structure)
pdb_string = _get_pdb_string(as_file.getTopology(), as_file.getPositions())
if checks:
_check_cleaned_atoms(pdb_string, prot_pdb_string)
return pdb_string
def make_atom14_positions(prot):
"""Constructs denser atom positions (14 dimensions instead of 37)."""
restype_atom14_to_atom37 = [] # mapping (restype, atom14) --> atom37
restype_atom37_to_atom14 = [] # mapping (restype, atom37) --> atom14
restype_atom14_mask = []
for rt in residue_constants.restypes:
atom_names = residue_constants.restype_name_to_atom14_names[
residue_constants.restype_1to3[rt]]
restype_atom14_to_atom37.append([
(residue_constants.atom_order[name] if name else 0) for name in atom_names
])
atom_name_to_idx14 = {name: i for i, name in enumerate(atom_names)}
restype_atom37_to_atom14.append([
(atom_name_to_idx14[name] if name in atom_name_to_idx14 else 0)
for name in residue_constants.atom_types
])
restype_atom14_mask.append([(1.0 if name else 0.0) for name in atom_names])
# Add dummy mapping for restype 'UNK'.
restype_atom14_to_atom37.append([0] * 14)
restype_atom37_to_atom14.append([0] * 37)
restype_atom14_mask.append([0.0] * 14)
restype_atom14_to_atom37 = np.array(restype_atom14_to_atom37, dtype=np.int32)
restype_atom37_to_atom14 = np.array(restype_atom37_to_atom14, dtype=np.int32)
restype_atom14_mask = np.array(restype_atom14_mask, dtype=np.float32)
# Create the mapping for (residx, atom14) --> atom37, i.e. an array
# with shape (num_res, 14) containing the atom37 indices for this protein.
residx_atom14_to_atom37 = restype_atom14_to_atom37[prot["aatype"]]
residx_atom14_mask = restype_atom14_mask[prot["aatype"]]
# Create a mask for known ground truth positions.
residx_atom14_gt_mask = residx_atom14_mask * np.take_along_axis(
prot["all_atom_mask"], residx_atom14_to_atom37, axis=1).astype(np.float32)
# Gather the ground truth positions.
residx_atom14_gt_positions = residx_atom14_gt_mask[:, :, None] * (np.take_along_axis(
prot["all_atom_positions"],
residx_atom14_to_atom37[..., None],
axis=1,
))
prot["atom14_atom_exists"] = residx_atom14_mask
prot["atom14_gt_exists"] = residx_atom14_gt_mask
prot["atom14_gt_positions"] = residx_atom14_gt_positions
prot["residx_atom14_to_atom37"] = residx_atom14_to_atom37.astype(np.int64)
# Create the gather indices for mapping back.
residx_atom37_to_atom14 = restype_atom37_to_atom14[prot["aatype"]]
prot["residx_atom37_to_atom14"] = residx_atom37_to_atom14.astype(np.int64)
# Create the corresponding mask.
restype_atom37_mask = np.zeros([21, 37], dtype=np.float32)
for restype, restype_letter in enumerate(residue_constants.restypes):
restype_name = residue_constants.restype_1to3[restype_letter]
atom_names = residue_constants.residue_atoms[restype_name]
for atom_name in atom_names:
atom_type = residue_constants.atom_order[atom_name]
restype_atom37_mask[restype, atom_type] = 1
residx_atom37_mask = restype_atom37_mask[prot["aatype"]]
prot["atom37_atom_exists"] = residx_atom37_mask
# As the atom naming is ambiguous for 7 of the 20 amino acids, provide
# alternative ground truth coordinates where the naming is swapped
restype_3 = [residue_constants.restype_1to3[res] for res in residue_constants.restypes]
restype_3 += ["UNK"]
# Matrices for renaming ambiguous atoms.
all_matrices = {res: np.eye(14, dtype=np.float32) for res in restype_3}
for resname, swap in residue_constants.residue_atom_renaming_swaps.items():
correspondences = np.arange(14)
for source_atom_swap, target_atom_swap in swap.items():
source_index = residue_constants.restype_name_to_atom14_names[resname].index(
source_atom_swap)
target_index = residue_constants.restype_name_to_atom14_names[resname].index(
target_atom_swap)
correspondences[source_index] = target_index
correspondences[target_index] = source_index
renaming_matrix = np.zeros((14, 14), dtype=np.float32)
for index, correspondence in enumerate(correspondences):
renaming_matrix[index, correspondence] = 1.0
all_matrices[resname] = renaming_matrix.astype(np.float32)
renaming_matrices = np.stack([all_matrices[restype] for restype in restype_3])
# Pick the transformation matrices for the given residue sequence
# shape (num_res, 14, 14).
renaming_transform = renaming_matrices[prot["aatype"]]
# Apply it to the ground truth positions. shape (num_res, 14, 3).
alternative_gt_positions = np.einsum("rac,rab->rbc", residx_atom14_gt_positions,
renaming_transform)
prot["atom14_alt_gt_positions"] = alternative_gt_positions
# Create the mask for the alternative ground truth (differs from the
# ground truth mask, if only one of the atoms in an ambiguous pair has a
# ground truth position).
alternative_gt_mask = np.einsum("ra,rab->rb", residx_atom14_gt_mask, renaming_transform)
prot["atom14_alt_gt_exists"] = alternative_gt_mask
# Create an ambiguous atoms mask. shape: (21, 14).
restype_atom14_is_ambiguous = np.zeros((21, 14), dtype=np.float32)
for resname, swap in residue_constants.residue_atom_renaming_swaps.items():
for atom_name1, atom_name2 in swap.items():
restype = residue_constants.restype_order[residue_constants.restype_3to1[resname]]
atom_idx1 = residue_constants.restype_name_to_atom14_names[resname].index(atom_name1)
atom_idx2 = residue_constants.restype_name_to_atom14_names[resname].index(atom_name2)
restype_atom14_is_ambiguous[restype, atom_idx1] = 1
restype_atom14_is_ambiguous[restype, atom_idx2] = 1
# From this create an ambiguous_mask for the given sequence.
prot["atom14_atom_is_ambiguous"] = restype_atom14_is_ambiguous[prot["aatype"]]
return prot
def find_violations(prot_np: protein.Protein):
"""Analyzes a protein and returns structural violation information.
Args:
prot_np: A protein.
Returns:
violations: A `dict` of structure components with structural violations.
violation_metrics: A `dict` of violation metrics.
"""
batch = {
"aatype": prot_np.aatype,
"all_atom_positions": prot_np.atom_positions.astype(np.float32),
"all_atom_mask": prot_np.atom_mask.astype(np.float32),
"residue_index": prot_np.residue_index,
}
batch["seq_mask"] = np.ones_like(batch["aatype"], np.float32)
batch = make_atom14_positions(batch)
violations = loss.find_structural_violations_np(
batch=batch,
atom14_pred_positions=batch["atom14_gt_positions"],
config=ml_collections.ConfigDict({
"violation_tolerance_factor": 12, # Taken from model config.
"clash_overlap_tolerance": 1.5, # Taken from model config.
}),
)
violation_metrics = loss.compute_violation_metrics_np(
batch=batch,
atom14_pred_positions=batch["atom14_gt_positions"],
violations=violations,
)
return violations, violation_metrics
def get_violation_metrics(prot: protein.Protein):
"""Computes violation and alignment metrics."""
structural_violations, struct_metrics = find_violations(prot)
violation_idx = np.flatnonzero(structural_violations["total_per_residue_violations_mask"])
struct_metrics["residue_violations"] = violation_idx
struct_metrics["num_residue_violations"] = len(violation_idx)
struct_metrics["structural_violations"] = structural_violations
return struct_metrics
def _run_one_iteration(
*,
pdb_string: str,
max_iterations: int,
tolerance: float,
stiffness: float,
restraint_set: str,
max_attempts: int,
exclude_residues: Optional[Collection[int]] = None,
use_gpu: bool,
):
"""Runs the minimization pipeline.
Args:
pdb_string: A pdb string.
max_iterations: An `int` specifying the maximum number of L-BFGS iterations.
A value of 0 specifies no limit.
tolerance: kcal/mol, the energy tolerance of L-BFGS.
stiffness: kcal/mol A**2, spring constant of heavy atom restraining
potential.
restraint_set: The set of atoms to restrain.
max_attempts: The maximum number of minimization attempts.
exclude_residues: An optional list of zero-indexed residues to exclude from
restraints.
use_gpu: Whether to run relaxation on GPU
Returns:
A `dict` of minimization info.
"""
exclude_residues = exclude_residues or []
# Assign physical dimensions.
tolerance = tolerance * ENERGY
stiffness = stiffness * ENERGY / (LENGTH**2)
start = time.perf_counter()
minimized = False
attempts = 0
while not minimized and attempts < max_attempts:
attempts += 1
try:
logging.info("Minimizing protein, attempt %d of %d.", attempts, max_attempts)
ret = _openmm_minimize(
pdb_string,
max_iterations=max_iterations,
tolerance=tolerance,
stiffness=stiffness,
restraint_set=restraint_set,
exclude_residues=exclude_residues,
use_gpu=use_gpu,
)
minimized = True
except Exception as e: # pylint: disable=broad-except
print(e)
logging.info(e)
if not minimized:
raise ValueError(f"Minimization failed after {max_attempts} attempts.")
ret["opt_time"] = time.perf_counter() - start
ret["min_attempts"] = attempts
return ret
def run_pipeline(
prot: protein.Protein,
stiffness: float,
use_gpu: bool,
max_outer_iterations: int = 1,
place_hydrogens_every_iteration: bool = True,
max_iterations: int = 0,
tolerance: float = 2.39,
restraint_set: str = "non_hydrogen",
max_attempts: int = 100,
checks: bool = True,
exclude_residues: Optional[Sequence[int]] = None,
):
"""Run iterative amber relax.
Successive relax iterations are performed until all violations have been
resolved. Each iteration involves a restrained Amber minimization, with
restraint exclusions determined by violation-participating residues.
Args:
prot: A protein to be relaxed.
stiffness: kcal/mol A**2, the restraint stiffness.
use_gpu: Whether to run on GPU
max_outer_iterations: The maximum number of iterative minimization.
place_hydrogens_every_iteration: Whether hydrogens are re-initialized
prior to every minimization.
max_iterations: An `int` specifying the maximum number of L-BFGS steps
per relax iteration. A value of 0 specifies no limit.
tolerance: kcal/mol, the energy tolerance of L-BFGS.
The default value is the OpenMM default.
restraint_set: The set of atoms to restrain.
max_attempts: The maximum number of minimization attempts per iteration.
checks: Whether to perform cleaning checks.
exclude_residues: An optional list of zero-indexed residues to exclude from
restraints.
Returns:
out: A dictionary of output values.
"""
# `protein.to_pdb` will strip any poorly-defined residues so we need to
# perform this check before `clean_protein`.
_check_residues_are_well_defined(prot)
pdb_string = clean_protein(prot, checks=checks)
exclude_residues = exclude_residues or []
exclude_residues = set(exclude_residues)
violations = np.inf
iteration = 0
while violations > 0 and iteration < max_outer_iterations:
ret = _run_one_iteration(
pdb_string=pdb_string,
exclude_residues=exclude_residues,
max_iterations=max_iterations,
tolerance=tolerance,
stiffness=stiffness,
restraint_set=restraint_set,
max_attempts=max_attempts,
use_gpu=use_gpu,
)
prot = protein.from_pdb_string(ret["min_pdb"])
if place_hydrogens_every_iteration:
pdb_string = clean_protein(prot, checks=True)
else:
pdb_string = ret["min_pdb"]
ret.update(get_violation_metrics(prot))
ret.update({
"num_exclusions": len(exclude_residues),
"iteration": iteration,
})
violations = ret["violations_per_residue"]
exclude_residues = exclude_residues.union(ret["residue_violations"])
logging.info(
"Iteration completed: Einit %.2f Efinal %.2f Time %.2f s "
"num residue violations %d num residue exclusions %d ",
ret["einit"],
ret["efinal"],
ret["opt_time"],
ret["num_residue_violations"],
ret["num_exclusions"],
)
iteration += 1
return ret
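# Usage sketch added for illustration: relaxes a predicted structure given as a
# protein.Protein instance. The stiffness and iteration counts are illustrative.
def _example_run_pipeline(prot: protein.Protein) -> str:
    out = run_pipeline(
        prot=prot,
        stiffness=10.0,
        use_gpu=False,
        max_outer_iterations=1,
    )
    return out["min_pdb"]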
def get_initial_energies(
pdb_strs: Sequence[str],
stiffness: float = 0.0,
restraint_set: str = "non_hydrogen",
exclude_residues: Optional[Sequence[int]] = None,
):
"""Returns initial potential energies for a sequence of PDBs.
Assumes the input PDBs are ready for minimization, and all have the same
topology.
Allows time to be saved by not pdbfixing / rebuilding the system.
Args:
pdb_strs: List of PDB strings.
stiffness: kcal/mol A**2, spring constant of heavy atom restraining
potential.
restraint_set: Which atom types to restrain.
exclude_residues: An optional list of zero-indexed residues to exclude from
restraints.
Returns:
A list of initial energies in the same order as pdb_strs.
"""
exclude_residues = exclude_residues or []
openmm_pdbs = [openmm_app.PDBFile(PdbStructure(io.StringIO(p))) for p in pdb_strs]
force_field = openmm_app.ForceField("amber99sb.xml")
system = force_field.createSystem(openmm_pdbs[0].topology, constraints=openmm_app.HBonds)
stiffness = stiffness * ENERGY / (LENGTH**2)
if stiffness > 0 * ENERGY / (LENGTH**2):
_add_restraints(system, openmm_pdbs[0], stiffness, restraint_set, exclude_residues)
simulation = openmm_app.Simulation(
openmm_pdbs[0].topology,
system,
openmm.LangevinIntegrator(0, 0.01, 0.0),
openmm.Platform.getPlatformByName("CPU"),
)
energies = []
for pdb in openmm_pdbs:
try:
simulation.context.setPositions(pdb.positions)
state = simulation.context.getState(getEnergy=True)
energies.append(state.getPotentialEnergy().value_in_unit(ENERGY))
except Exception as e: # pylint: disable=broad-except
logging.error("Error getting initial energy, returning large value %s", e)
energies.append(unit.Quantity(1e20, ENERGY))
return energies
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Cleans up a PDB file using pdbfixer in preparation for OpenMM simulations.
fix_pdb uses a third-party tool. We also support fixing some additional edge
cases like removing chains of length one (see clean_structure).
"""
import io
import pdbfixer
from openmm import app
from openmm.app import element
def fix_pdb(pdbfile, alterations_info):
"""Apply pdbfixer to the contents of a PDB file; return a PDB string result.
1) Replaces nonstandard residues.
2) Removes heterogens (non protein residues) including water.
3) Adds missing residues and missing atoms within existing residues.
4) Adds hydrogens assuming pH=7.0.
5) KeepIds is currently true, so the fixer must keep the existing chain and
residue identifiers. This will fail for some files in wider PDB that have
invalid IDs.
Args:
pdbfile: Input PDB file handle.
alterations_info: A dict that will store details of changes made.
Returns:
A PDB string representing the fixed structure.
"""
fixer = pdbfixer.PDBFixer(pdbfile=pdbfile)
fixer.findNonstandardResidues()
alterations_info['nonstandard_residues'] = fixer.nonstandardResidues
fixer.replaceNonstandardResidues()
_remove_heterogens(fixer, alterations_info, keep_water=False)
fixer.findMissingResidues()
alterations_info['missing_residues'] = fixer.missingResidues
fixer.findMissingAtoms()
alterations_info['missing_heavy_atoms'] = fixer.missingAtoms
alterations_info['missing_terminals'] = fixer.missingTerminals
fixer.addMissingAtoms(seed=0)
fixer.addMissingHydrogens()
out_handle = io.StringIO()
app.PDBFile.writeFile(fixer.topology, fixer.positions, out_handle,
keepIds=True)
return out_handle.getvalue()
def clean_structure(pdb_structure, alterations_info):
"""Applies additional fixes to an OpenMM structure, to handle edge cases.
Args:
pdb_structure: An OpenMM structure to modify and fix.
alterations_info: A dict that will store details of changes made.
"""
_replace_met_se(pdb_structure, alterations_info)
_remove_chains_of_length_one(pdb_structure, alterations_info)
def _remove_heterogens(fixer, alterations_info, keep_water):
"""Removes the residues that Pdbfixer considers to be heterogens.
Args:
fixer: A Pdbfixer instance.
alterations_info: A dict that will store details of changes made.
keep_water: If True, water (HOH) is not considered to be a heterogen.
"""
initial_resnames = set()
for chain in fixer.topology.chains():
for residue in chain.residues():
initial_resnames.add(residue.name)
fixer.removeHeterogens(keepWater=keep_water)
final_resnames = set()
for chain in fixer.topology.chains():
for residue in chain.residues():
final_resnames.add(residue.name)
alterations_info['removed_heterogens'] = (
initial_resnames.difference(final_resnames))
def _replace_met_se(pdb_structure, alterations_info):
"""Replace the Se in any MET residues that were not marked as modified."""
modified_met_residues = []
for res in pdb_structure.iter_residues():
name = res.get_name_with_spaces().strip()
if name == 'MET':
s_atom = res.get_atom('SD')
if s_atom.element_symbol == 'Se':
s_atom.element_symbol = 'S'
s_atom.element = element.get_by_symbol('S')
modified_met_residues.append(s_atom.residue_number)
alterations_info['Se_in_MET'] = modified_met_residues
def _remove_chains_of_length_one(pdb_structure, alterations_info):
"""Removes chains that correspond to a single amino acid.
A single amino acid in a chain is both N and C terminus. There is no force
template for this case.
Args:
pdb_structure: An OpenMM pdb_structure to modify and fix.
alterations_info: A dict that will store details of changes made.
"""
removed_chains = {}
for model in pdb_structure.iter_models():
valid_chains = [c for c in model.iter_chains() if len(c) > 1]
invalid_chain_ids = [c.chain_id for c in model.iter_chains() if len(c) <= 1]
model.chains = valid_chains
for chain_id in invalid_chain_ids:
model.chains_by_id.pop(chain_id)
removed_chains[model.number] = invalid_chain_ids
alterations_info['removed_chains'] = removed_chains
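# Usage sketch added for illustration: applies pdbfixer-based fixing followed by
# the additional structure-level cleanups, starting from a raw PDB string.
def _example_fix_and_clean(pdb_str: str) -> str:
    from openmm.app.internal.pdbstructure import PdbStructure
    alterations = {}
    fixed = fix_pdb(io.StringIO(pdb_str), alterations)
    structure = PdbStructure(io.StringIO(fixed))
    clean_structure(structure, alterations)
    return fixed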
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Amber relaxation."""
from typing import Any, Dict, Sequence, Tuple
import numpy as np
from fastfold.common import protein
from fastfold.relax import amber_minimize, utils
class AmberRelaxation(object):
"""Amber relaxation."""
def __init__(
self,
*,
max_iterations: int,
tolerance: float,
stiffness: float,
exclude_residues: Sequence[int],
max_outer_iterations: int,
use_gpu: bool,
):
"""Initialize Amber Relaxer.
Args:
max_iterations: Maximum number of L-BFGS iterations. 0 means no max.
tolerance: kcal/mol, the energy tolerance of L-BFGS.
stiffness: kcal/mol A**2, spring constant of heavy atom restraining
potential.
exclude_residues: Residues to exclude from per-atom restraining.
Zero-indexed.
max_outer_iterations: Maximum number of violation-informed relax
iterations. A value of 1 will run the non-iterative procedure used in
CASP14. Use 20 so that >95% of the bad cases are relaxed. Relax finishes
as soon as there are no violations, hence in most cases this causes no
slowdown. In the worst case we do 20 outer iterations.
use_gpu: Whether to run on GPU
"""
self._max_iterations = max_iterations
self._tolerance = tolerance
self._stiffness = stiffness
self._exclude_residues = exclude_residues
self._max_outer_iterations = max_outer_iterations
self._use_gpu = use_gpu
def process(
self, *, prot: protein.Protein
) -> Tuple[str, Dict[str, Any], np.ndarray]:
"""Runs Amber relax on a prediction, adds hydrogens, returns PDB string."""
out = amber_minimize.run_pipeline(
prot=prot,
max_iterations=self._max_iterations,
tolerance=self._tolerance,
stiffness=self._stiffness,
exclude_residues=self._exclude_residues,
max_outer_iterations=self._max_outer_iterations,
use_gpu=self._use_gpu,
)
min_pos = out["pos"]
start_pos = out["posinit"]
rmsd = np.sqrt(np.sum((start_pos - min_pos) ** 2) / start_pos.shape[0])
debug_data = {
"initial_energy": out["einit"],
"final_energy": out["efinal"],
"attempts": out["min_attempts"],
"rmsd": rmsd,
}
pdb_str = amber_minimize.clean_protein(prot)
min_pdb = utils.overwrite_pdb_coordinates(pdb_str, min_pos)
min_pdb = utils.overwrite_b_factors(min_pdb, prot.b_factors)
utils.assert_equal_nonterminal_atom_types(
protein.from_pdb_string(min_pdb).atom_mask, prot.atom_mask
)
violations = out["structural_violations"][
"total_per_residue_violations_mask"
]
return min_pdb, debug_data, violations
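# Usage sketch (not part of the original file; the parameter values below are
# illustrative assumptions rather than documented defaults). Given a predicted
# `protein.Protein` object `prot`:
#
#     relaxer = AmberRelaxation(
#         max_iterations=0,          # no cap on L-BFGS steps
#         tolerance=2.39,            # kcal/mol
#         stiffness=10.0,            # kcal/mol A**2
#         exclude_residues=[],
#         max_outer_iterations=20,
#         use_gpu=False,
#     )
#     relaxed_pdb, debug_data, violations = relaxer.process(prot=prot)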
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utils for minimization."""
import collections.abc
import io
import numbers
import numpy as np
from Bio import PDB
from openmm import app as openmm_app
from openmm.app.internal.pdbstructure import PdbStructure
from fastfold.common import residue_constants
def overwrite_pdb_coordinates(pdb_str: str, pos) -> str:
pdb_file = io.StringIO(pdb_str)
structure = PdbStructure(pdb_file)
topology = openmm_app.PDBFile(structure).getTopology()
with io.StringIO() as f:
openmm_app.PDBFile.writeFile(topology, pos, f)
return f.getvalue()
def overwrite_b_factors(pdb_str: str, bfactors: np.ndarray) -> str:
"""Overwrites the B-factors in pdb_str with contents of bfactors array.
Args:
pdb_str: An input PDB string.
bfactors: A numpy array with shape [n_residues, 37]. We assume that the
B-factors are per residue; i.e. that the nonzero entries are identical in
[i, :].
Returns:
A new PDB string with the B-factors replaced.
"""
if bfactors.shape[-1] != residue_constants.atom_type_num:
raise ValueError(f'Invalid final dimension size for bfactors: {bfactors.shape[-1]}.')
parser = PDB.PDBParser(QUIET=True)
handle = io.StringIO(pdb_str)
structure = parser.get_structure('', handle)
curr_resid = ('', '', '')
idx = -1
for atom in structure.get_atoms():
atom_resid = atom.parent.get_id()
if atom_resid != curr_resid:
idx += 1
if idx >= bfactors.shape[0]:
raise ValueError('Index into bfactors exceeds number of residues. '
f'B-factors shape: {bfactors.shape}, idx: {idx}.')
curr_resid = atom_resid
atom.bfactor = bfactors[idx, residue_constants.atom_order['CA']]
new_pdb = io.StringIO()
pdb_io = PDB.PDBIO()
pdb_io.set_structure(structure)
pdb_io.save(new_pdb)
return new_pdb.getvalue()
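# Example sketch (hypothetical inputs): a per-residue confidence vector can be
# broadcast into the [n_residues, 37] layout expected above before being
# written into the B-factor column.
#
#     per_res_conf = np.ones(n_res)  # placeholder per-residue values
#     bfactors = np.repeat(
#         per_res_conf[:, None], residue_constants.atom_type_num, axis=-1
#     )
#     pdb_with_conf = overwrite_b_factors(pdb_str, bfactors)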
def assert_equal_nonterminal_atom_types(atom_mask: np.ndarray, ref_atom_mask: np.ndarray):
"""Checks that pre- and post-minimized proteins have same atom set."""
# Ignore any terminal OXT atoms which may have been added by minimization.
oxt = residue_constants.atom_order['OXT']
no_oxt_mask = np.ones(shape=atom_mask.shape, dtype=bool)
no_oxt_mask[..., oxt] = False
np.testing.assert_almost_equal(ref_atom_mask[no_oxt_mask], atom_mask[no_oxt_mask])
def mask_mean_np(mask, value, axis=None, drop_mask_channel=False, eps=1e-10):
"""Masked mean with numpy."""
if drop_mask_channel:
mask = mask[..., 0]
mask_shape = mask.shape
value_shape = value.shape
assert len(mask_shape) == len(value_shape)
if isinstance(axis, numbers.Integral):
axis = [axis]
elif axis is None:
axis = list(range(len(mask_shape)))
assert isinstance(
axis, collections.abc.Iterable), ('axis needs to be either an iterable, integer or "None"')
broadcast_factor = 1.
for axis_ in axis:
value_size = value_shape[axis_]
mask_size = mask_shape[axis_]
if mask_size == 1:
broadcast_factor *= value_size
else:
assert mask_size == value_size
axis = tuple(axis)
return (np.sum(mask * value, axis=axis) / (np.sum(mask, axis=axis) * broadcast_factor + eps))
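# Worked example (illustrative): only unmasked entries contribute to the mean.
#
#     value = np.array([[1., 2.], [3., 4.]])
#     mask = np.array([[1., 1.], [1., 0.]])
#     mask_mean_np(mask, value)  # (1 + 2 + 3) / 3 ~= 2.0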
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ops for all atom representations."""
from functools import partial
from typing import Dict, Text, Tuple
import torch
from fastfold.common import residue_constants as rc
from fastfold.utils import geometry, tensor_utils
import numpy as np
def squared_difference(x, y):
return torch.square(x - y)
def get_rc_tensor(rc_np, aatype):
return torch.tensor(rc_np, device=aatype.device)[aatype]
def atom14_to_atom37(
atom14_data: torch.Tensor, # (*, N, 14, ...)
aatype: torch.Tensor # (*, N)
) -> torch.Tensor: # (*, N, 37, ...)
"""Convert atom14 to atom37 representation."""
idx_atom37_to_atom14 = get_rc_tensor(rc.RESTYPE_ATOM37_TO_ATOM14, aatype)
no_batch_dims = len(aatype.shape) - 1
atom37_data = tensor_utils.batched_gather(
atom14_data,
idx_atom37_to_atom14,
dim=no_batch_dims + 1,
no_batch_dims=no_batch_dims + 1
)
atom37_mask = get_rc_tensor(rc.RESTYPE_ATOM37_MASK, aatype)
if len(atom14_data.shape) == no_batch_dims + 2:
atom37_data *= atom37_mask
elif len(atom14_data.shape) == no_batch_dims + 3:
atom37_data *= atom37_mask[..., None].to(atom37_data.dtype)
else:
raise ValueError("Incorrectly shaped data")
return atom37_data
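# Shape sketch (illustrative): for a single chain of N residues,
#
#     # aatype: (N,) long tensor of residue type indices
#     # atom14_data: (N, 14, 3) positions in atom14 ordering
#     atom37_data = atom14_to_atom37(atom14_data, aatype)  # (N, 37, 3)
#
# Entries for atoms that do not exist in a given residue type are zeroed by
# the per-residue atom37 mask.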
def atom37_to_atom14(aatype, all_atom_pos, all_atom_mask):
"""Convert Atom37 positions to Atom14 positions."""
residx_atom14_to_atom37 = get_rc_tensor(
rc.RESTYPE_ATOM14_TO_ATOM37, aatype
)
no_batch_dims = len(aatype.shape) - 1
atom14_mask = tensor_utils.batched_gather(
all_atom_mask,
residx_atom14_to_atom37,
dim=no_batch_dims + 1,
no_batch_dims=no_batch_dims + 1,
).to(torch.float32)
# create a mask for known groundtruth positions
atom14_mask *= get_rc_tensor(rc.RESTYPE_ATOM14_MASK, aatype)
# gather the groundtruth positions
atom14_positions = tensor_utils.batched_gather(
all_atom_pos,
residx_atom14_to_atom37,
dim=no_batch_dims + 1,
no_batch_dims=no_batch_dims + 1,
)
atom14_positions = atom14_mask * atom14_positions
return atom14_positions, atom14_mask
def get_alt_atom14(aatype, positions: torch.Tensor, mask):
"""Get alternative atom14 positions."""
# pick the transformation matrices for the given residue sequence
# shape (num_res, 14, 14)
renaming_transform = get_rc_tensor(rc.RENAMING_MATRICES, aatype)
alternative_positions = torch.sum(
positions[..., None, :] * renaming_transform[..., None],
dim=-2
)
# Create the mask for the alternative ground truth (differs from the
# ground truth mask, if only one of the atoms in an ambiguous pair has a
# ground truth position)
alternative_mask = torch.sum(mask[..., None] * renaming_transform, dim=-2)
return alternative_positions, alternative_mask
def atom37_to_frames(
aatype: torch.Tensor, # (...)
all_atom_positions: torch.Tensor, # (..., 37)
all_atom_mask: torch.Tensor, # (..., 37)
) -> Dict[Text, torch.Tensor]:
"""Computes the frames for the up to 8 rigid groups for each residue."""
# 0: 'backbone group',
# 1: 'pre-omega-group', (empty)
# 2: 'phi-group', (currently empty, because it defines only hydrogens)
# 3: 'psi-group',
# 4,5,6,7: 'chi1,2,3,4-group'
no_batch_dims = len(aatype.shape) - 1
# Compute the gather indices for all residues in the chain.
# shape (N, 8, 3)
residx_rigidgroup_base_atom37_idx = get_rc_tensor(
rc.RESTYPE_RIGIDGROUP_BASE_ATOM37_IDX, aatype
)
# Gather the base atom positions for each rigid group.
base_atom_pos = tensor_utils.batched_gather(
all_atom_positions,
residx_rigidgroup_base_atom37_idx,
dim=no_batch_dims + 1,
no_batch_dims=no_batch_dims + 1,
)
# Compute the Rigids.
point_on_neg_x_axis = base_atom_pos[..., :, :, 0]
origin = base_atom_pos[..., :, :, 1]
point_on_xy_plane = base_atom_pos[..., :, :, 2]
gt_rotation = geometry.Rot3Array.from_two_vectors(
origin - point_on_neg_x_axis, point_on_xy_plane - origin
)
gt_frames = geometry.Rigid3Array(gt_rotation, origin)
# Compute a mask whether the group exists.
# (N, 8)
group_exists = get_rc_tensor(rc.RESTYPE_RIGIDGROUP_MASK, aatype)
# Compute a mask whether ground truth exists for the group
gt_atoms_exist = tensor_utils.batched_gather( # shape (N, 8, 3)
all_atom_mask.to(dtype=torch.float32),
residx_rigidgroup_base_atom37_idx,
dim=no_batch_dims + 1,
no_batch_dims=no_batch_dims + 1,
)
gt_exists = torch.min(gt_atoms_exist, dim=-1).values * group_exists  # (N, 8)
# Adapt backbone frame to old convention (mirror x-axis and z-axis).
rots = np.tile(np.eye(3, dtype=np.float32), [8, 1, 1])
rots[0, 0, 0] = -1
rots[0, 2, 2] = -1
gt_frames = gt_frames.compose_rotation(
geometry.Rot3Array.from_array(
torch.tensor(rots, device=aatype.device)
)
)
# The frames for ambiguous rigid groups are just rotated by 180 degrees around
# the x-axis. The ambiguous group is always the last chi-group.
restype_rigidgroup_is_ambiguous = np.zeros([21, 8], dtype=np.float32)
restype_rigidgroup_rots = np.tile(
np.eye(3, dtype=np.float32), [21, 8, 1, 1]
)
for resname, _ in rc.residue_atom_renaming_swaps.items():
restype = rc.restype_order[
rc.restype_3to1[resname]
]
chi_idx = int(sum(rc.chi_angles_mask[restype]) - 1)
restype_rigidgroup_is_ambiguous[restype, chi_idx + 4] = 1
restype_rigidgroup_rots[restype, chi_idx + 4, 1, 1] = -1
restype_rigidgroup_rots[restype, chi_idx + 4, 2, 2] = -1
# Gather the ambiguity information for each residue.
residx_rigidgroup_is_ambiguous = torch.tensor(
restype_rigidgroup_is_ambiguous,
device=aatype.device,
)[aatype]
ambiguity_rot = torch.tensor(
restype_rigidgroup_rots,
device=aatype.device,
)[aatype]
ambiguity_rot = geometry.Rot3Array.from_array(ambiguity_rot)
# Create the alternative ground truth frames.
alt_gt_frames = gt_frames.compose_rotation(ambiguity_rot)
fix_shape = lambda x: x.reshape(aatype.shape + (8,))
# reshape back to original residue layout
gt_frames = fix_shape(gt_frames)
gt_exists = fix_shape(gt_exists)
group_exists = fix_shape(group_exists)
residx_rigidgroup_is_ambiguous = fix_shape(residx_rigidgroup_is_ambiguous)
alt_gt_frames = fix_shape(alt_gt_frames)
return {
'rigidgroups_gt_frames': gt_frames, # Rigid (..., 8)
'rigidgroups_gt_exists': gt_exists, # (..., 8)
'rigidgroups_group_exists': group_exists, # (..., 8)
'rigidgroups_group_is_ambiguous':
residx_rigidgroup_is_ambiguous, # (..., 8)
'rigidgroups_alt_gt_frames': alt_gt_frames, # Rigid (..., 8)
}
def torsion_angles_to_frames(
aatype: torch.Tensor, # (N)
backb_to_global: geometry.Rigid3Array, # (N)
torsion_angles_sin_cos: torch.Tensor # (N, 7, 2)
) -> geometry.Rigid3Array: # (N, 8)
"""Compute rigid group frames from torsion angles."""
# Gather the default frames for all rigid groups.
# geometry.Rigid3Array with shape (N, 8)
m = get_rc_tensor(rc.restype_rigid_group_default_frame, aatype)
default_frames = geometry.Rigid3Array.from_array4x4(m)
# Create the rotation matrices according to the given angles (each frame is
# defined such that its rotation is around the x-axis).
sin_angles = torsion_angles_sin_cos[..., 0]
cos_angles = torsion_angles_sin_cos[..., 1]
# insert zero rotation for backbone group.
num_residues = aatype.shape[-1]
sin_angles = torch.cat(
[
torch.zeros_like(sin_angles[..., :1]),
sin_angles,
],
dim=-1,
)
cos_angles = torch.cat(
[
torch.ones_like(cos_angles[..., :1]),
cos_angles,
],
dim=-1,
)
zeros = torch.zeros_like(sin_angles)
ones = torch.ones_like(sin_angles)
# all_rots are geometry.Rot3Array with shape (..., N, 8)
all_rots = geometry.Rot3Array(
ones, zeros, zeros,
zeros, cos_angles, -sin_angles,
zeros, sin_angles, cos_angles
)
# Apply rotations to the frames.
all_frames = default_frames.compose_rotation(all_rots)
# chi2, chi3, and chi4 frames do not transform to the backbone frame but to
# the previous frame. So chain them up accordingly.
chi1_frame_to_backb = all_frames[..., 4]
chi2_frame_to_backb = chi1_frame_to_backb @ all_frames[..., 5]
chi3_frame_to_backb = chi2_frame_to_backb @ all_frames[..., 6]
chi4_frame_to_backb = chi3_frame_to_backb @ all_frames[..., 7]
all_frames_to_backb = geometry.Rigid3Array.cat(
[
all_frames[..., 0:5],
chi2_frame_to_backb[..., None],
chi3_frame_to_backb[..., None],
chi4_frame_to_backb[..., None]
],
dim=-1
)
# Create the global frames.
# shape (N, 8)
all_frames_to_global = backb_to_global[..., None] @ all_frames_to_backb
return all_frames_to_global
def frames_and_literature_positions_to_atom14_pos(
aatype: torch.Tensor, # (*, N)
all_frames_to_global: geometry.Rigid3Array # (N, 8)
) -> geometry.Vec3Array: # (*, N, 14)
"""Put atom literature positions (atom14 encoding) in each rigid group."""
# Pick the appropriate transform for every atom.
residx_to_group_idx = get_rc_tensor(
rc.restype_atom14_to_rigid_group,
aatype
)
group_mask = torch.nn.functional.one_hot(
residx_to_group_idx,
num_classes=8
) # shape (*, N, 14, 8)
# geometry.Rigid3Array with shape (N, 14)
map_atoms_to_global = all_frames_to_global[..., None, :] * group_mask
map_atoms_to_global = map_atoms_to_global.map_tensor_fn(
partial(torch.sum, dim=-1)
)
# Gather the literature atom positions for each residue.
# geometry.Vec3Array with shape (N, 14)
lit_positions = geometry.Vec3Array.from_array(
get_rc_tensor(
rc.restype_atom14_rigid_group_positions,
aatype
)
)
# Transform each atom from its local frame to the global frame.
# geometry.Vec3Array with shape (N, 14)
pred_positions = map_atoms_to_global.apply_to_point(lit_positions)
# Mask out non-existing atoms.
mask = get_rc_tensor(rc.restype_atom14_mask, aatype)
pred_positions = pred_positions * mask
return pred_positions
def extreme_ca_ca_distance_violations(
positions: geometry.Vec3Array, # (N, 37(14))
mask: torch.Tensor, # (N, 37(14))
residue_index: torch.Tensor, # (N)
max_angstrom_tolerance=1.5,
eps: float = 1e-6
) -> torch.Tensor:
"""Counts residues whose Ca is a large distance from its neighbor."""
this_ca_pos = positions[..., :-1, 1] # (N - 1,)
this_ca_mask = mask[..., :-1, 1] # (N - 1)
next_ca_pos = positions[..., 1:, 1] # (N - 1,)
next_ca_mask = mask[..., 1:, 1] # (N - 1)
has_no_gap_mask = (
(residue_index[..., 1:] - residue_index[..., :-1]) == 1.0
).to(torch.float32)
ca_ca_distance = geometry.euclidean_distance(this_ca_pos, next_ca_pos, eps)
violations = (ca_ca_distance - rc.ca_ca) > max_angstrom_tolerance
mask = this_ca_mask * next_ca_mask * has_no_gap_mask
return tensor_utils.masked_mean(mask=mask, value=violations, dim=-1)
def get_chi_atom_indices(device: torch.device):
"""Returns atom indices needed to compute chi angles for all residue types.
Returns:
A tensor of shape [residue_types=21, chis=4, atoms=4]. The residue types are
in the order specified in rc.restypes + unknown residue type
at the end. For chi angles which are not defined on the residue, the
positions indices are by default set to 0.
"""
chi_atom_indices = []
for residue_name in rc.restypes:
residue_name = rc.restype_1to3[residue_name]
residue_chi_angles = rc.chi_angles_atoms[residue_name]
atom_indices = []
for chi_angle in residue_chi_angles:
atom_indices.append(
[rc.atom_order[atom] for atom in chi_angle]
)
for _ in range(4 - len(atom_indices)):
atom_indices.append([0, 0, 0, 0]) # For chi angles not defined on the AA.
chi_atom_indices.append(atom_indices)
chi_atom_indices.append([[0, 0, 0, 0]] * 4) # For UNKNOWN residue.
return torch.tensor(chi_atom_indices, device=device)
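# Example (illustrative): the returned table has shape [21, 4, 4]; row r lists,
# for each of the four chi angles of residue type r, the four atom37 indices
# that define the dihedral (zeros where the chi angle is undefined).
#
#     chi_atom_indices = get_chi_atom_indices(torch.device("cpu"))
#     chi_atom_indices.shape  # torch.Size([21, 4, 4])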
def compute_chi_angles(
positions: geometry.Vec3Array,
mask: torch.Tensor,
aatype: torch.Tensor
):
"""Computes the chi angles given all atom positions and the amino acid type.
Args:
positions: A Vec3Array of shape
[num_res, rc.atom_type_num], with positions of
atoms needed to calculate chi angles. Supports up to 1 batch dimension.
mask: An optional tensor of shape
[num_res, rc.atom_type_num] that masks which atom
positions are set for each residue. If given, then the chi mask will be
set to 1 for a chi angle only if the amino acid has that chi angle and all
the chi atoms needed to calculate that chi angle are set. If not given
(set to None), the chi mask will be set to 1 for a chi angle if the amino
acid has that chi angle and whether the actual atoms needed to calculate
it were set will be ignored.
aatype: A tensor of shape [num_res] with amino acid type integer
code (0 to 21). Supports up to 1 batch dimension.
Returns:
A tuple of tensors (chi_angles, mask), where both have shape
[num_res, 4]. The mask masks out unused chi angles for amino acid
types that have less than 4 chi angles. If atom_positions_mask is set, the
chi mask will also mask out uncomputable chi angles.
"""
# Don't assert on the num_res and batch dimensions as they might be unknown.
assert positions.shape[-1] == rc.atom_type_num
assert mask.shape[-1] == rc.atom_type_num
no_batch_dims = len(aatype.shape) - 1
# Compute the table of chi angle indices. Shape: [restypes, chis=4, atoms=4].
chi_atom_indices = get_chi_atom_indices(aatype.device)
# DISCREPANCY: DeepMind doesn't remove the gaps here. I don't know why
# theirs works.
aatype_gapless = torch.clamp(aatype, max=20)
# Select atoms to compute chis. Shape: [*, num_res, chis=4, atoms=4].
atom_indices = chi_atom_indices[aatype_gapless]
# Gather atom positions. Shape: [num_res, chis=4, atoms=4, xyz=3].
chi_angle_atoms = positions.map_tensor_fn(
partial(
tensor_utils.batched_gather,
inds=atom_indices,
dim=-1,
no_batch_dims=no_batch_dims + 1
)
)
a, b, c, d = [chi_angle_atoms[..., i] for i in range(4)]
chi_angles = geometry.dihedral_angle(a, b, c, d)
# Copy the chi angle mask, add the UNKNOWN residue. Shape: [restypes, 4].
chi_angles_mask = list(rc.chi_angles_mask)
chi_angles_mask.append([0.0, 0.0, 0.0, 0.0])
chi_angles_mask = torch.tensor(chi_angles_mask, device=aatype.device)
# Compute the chi angle mask. Shape [num_res, chis=4].
chi_mask = chi_angles_mask[aatype_gapless]
# The chi_mask is set to 1 only when all necessary chi angle atoms were set.
# Gather the chi angle atoms mask. Shape: [num_res, chis=4, atoms=4].
chi_angle_atoms_mask = tensor_utils.batched_gather(
mask,
atom_indices,
dim=-1,
no_batch_dims=no_batch_dims + 1
)
# Check if all 4 chi angle atoms were set. Shape: [num_res, chis=4].
chi_angle_atoms_mask = torch.prod(chi_angle_atoms_mask, dim=-1)
chi_mask = chi_mask * chi_angle_atoms_mask.to(torch.float32)
return chi_angles, chi_mask
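# Usage sketch (hypothetical inputs): given atom37 positions wrapped in a
# geometry.Vec3Array, the matching atom mask, and residue types,
#
#     chi_angles, chi_mask = compute_chi_angles(positions, mask, aatype)
#     # chi_angles: (num_res, 4) dihedrals in radians
#     # chi_mask:   (num_res, 4), 1 only where the chi angle exists and all
#     #             four defining atoms are present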
def make_transform_from_reference(
a_xyz: geometry.Vec3Array,
b_xyz: geometry.Vec3Array,
c_xyz: geometry.Vec3Array
) -> geometry.Rigid3Array:
"""Returns rotation and translation matrices to convert from reference.
Note that this method does not take care of symmetries. If you provide the
coordinates in the non-standard way, the A atom will end up in the negative
y-axis rather than in the positive y-axis. You need to take care of such
cases in your code.
Args:
a_xyz: A Vec3Array.
b_xyz: A Vec3Array.
c_xyz: A Vec3Array.
Returns:
A Rigid3Array which, when applied to coordinates in a canonicalized
reference frame, will give coordinates approximately equal to
the original coordinates (in the global frame).
"""
rotation = geometry.Rot3Array.from_two_vectors(
c_xyz - b_xyz,
a_xyz - b_xyz
)
return geometry.Rigid3Array(rotation, b_xyz)
def make_backbone_affine(
positions: geometry.Vec3Array,
mask: torch.Tensor,
aatype: torch.Tensor,
) -> Tuple[geometry.Rigid3Array, torch.Tensor]:
a = rc.atom_order['N']
b = rc.atom_order['CA']
c = rc.atom_order['C']
rigid_mask = (mask[..., a] * mask[..., b] * mask[..., c])
rigid = make_transform_from_reference(
a_xyz=positions[..., a],
b_xyz=positions[..., b],
c_xyz=positions[..., c],
)
return rigid, rigid_mask
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.utils.checkpoint
from typing import Any, Tuple, List, Callable, Optional
BLOCK_ARG = Any
BLOCK_ARGS = List[BLOCK_ARG]
def get_checkpoint_fn():
checkpoint = torch.utils.checkpoint.checkpoint
return checkpoint
@torch.jit.ignore
def checkpoint_blocks(
blocks: List[Callable],
args: BLOCK_ARGS,
blocks_per_ckpt: Optional[int],
) -> BLOCK_ARGS:
"""
Chunk a list of blocks and run each chunk with activation
checkpointing. We define a "block" as a callable whose only inputs are
the outputs of the previous block.
Implements Subsection 1.11.8
Args:
blocks:
List of blocks
args:
Tuple of arguments for the first block.
blocks_per_ckpt:
Size of each chunk. A higher value corresponds to fewer
checkpoints, and trades memory for speed. If None, no checkpointing
is performed.
Returns:
The output of the final block
"""
def wrap(a):
return (a,) if type(a) is not tuple else a
def exec(b, a):
for block in b:
a = wrap(block(*a))
return a
def chunker(s, e):
def exec_sliced(*a):
return exec(blocks[s:e], a)
return exec_sliced
# Avoids mishaps when the blocks take just one argument
args = wrap(args)
if blocks_per_ckpt is None:
return exec(blocks, args)
elif blocks_per_ckpt < 1 or blocks_per_ckpt > len(blocks):
raise ValueError("blocks_per_ckpt must be between 1 and len(blocks)")
checkpoint = get_checkpoint_fn()
for s in range(0, len(blocks), blocks_per_ckpt):
e = s + blocks_per_ckpt
args = checkpoint(chunker(s, e), *args)
args = wrap(args)
return args
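# Usage sketch (illustrative; the blocks and the (m, z) activations are
# assumptions): every block must consume and return the same tuple of tensors,
# and blocks_per_ckpt controls how many blocks share a single checkpoint.
#
#     blocks = [block_fn_1, block_fn_2, block_fn_3]  # each: (m, z) -> (m, z)
#     m, z = checkpoint_blocks(blocks, args=(m, z), blocks_per_ckpt=1)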
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import torch
import torch.nn as nn
from typing import Any, Dict, Optional, Tuple, Union
from fastfold.common import protein
import fastfold.common.residue_constants as rc
from fastfold.utils.geometry.rigid_matrix_vector import Rigid3Array
from fastfold.utils.geometry.rotation_matrix import Rot3Array
from fastfold.utils.rigid_utils import Rotation, Rigid
from fastfold.utils.tensor_utils import (
batched_gather,
one_hot,
tree_map,
tensor_tree_map,
)
def dgram_from_positions(
pos: torch.Tensor,
min_bin: float = 3.25,
max_bin: float = 50.75,
no_bins: int = 39,
inf: float = 1e8,
) -> torch.Tensor:
dgram = torch.sum(
(pos[..., None, :] - pos[..., None, :, :]) ** 2, dim=-1, keepdim=True
)
lower = torch.linspace(min_bin, max_bin, no_bins, device=pos.device) ** 2
upper = torch.cat([lower[1:], lower.new_tensor([inf])], dim=-1)
dgram = ((dgram > lower).type(dgram.dtype) * (dgram < upper)).type(dgram.dtype)
return dgram
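# Shape sketch (illustrative): for pseudo-beta coordinates of shape (N, 3),
# the result is a one-hot distogram over squared-distance bins.
#
#     pos = torch.randn(128, 3)
#     dgram = dgram_from_positions(pos)  # (128, 128, 39)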
def pseudo_beta_fn(
aatype, all_atom_positions: torch.Tensor, all_atom_masks: torch.Tensor
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
is_gly = aatype == rc.restype_order["G"]
ca_idx = rc.atom_order["CA"]
cb_idx = rc.atom_order["CB"]
pseudo_beta = torch.where(
is_gly[..., None].expand(*((-1,) * len(is_gly.shape)), 3),
all_atom_positions[..., ca_idx, :],
all_atom_positions[..., cb_idx, :],
)
if all_atom_masks is not None:
pseudo_beta_mask = torch.where(
is_gly,
all_atom_masks[..., ca_idx],
all_atom_masks[..., cb_idx],
)
return pseudo_beta, pseudo_beta_mask
else:
return pseudo_beta, None
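# Shape sketch (illustrative): CB coordinates are selected for every residue
# except glycine, which falls back to CA.
#
#     # aatype: (N,), all_atom_positions: (N, 37, 3), all_atom_masks: (N, 37)
#     pseudo_beta, pseudo_beta_mask = pseudo_beta_fn(
#         aatype, all_atom_positions, all_atom_masks
#     )
#     # pseudo_beta: (N, 3), pseudo_beta_mask: (N,)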
def atom14_to_atom37(atom14, batch: Dict[str, Any]):
atom37_data = batched_gather(
atom14,
batch["residx_atom37_to_atom14"],
dim=-2,
no_batch_dims=len(atom14.shape[:-2]),
)
atom37_data = atom37_data * batch["atom37_atom_exists"][..., None]
return atom37_data
def build_template_angle_feat(template_feats: Dict[str, Any]) -> torch.Tensor:
template_aatype = template_feats["template_aatype"]
torsion_angles_sin_cos = template_feats["template_torsion_angles_sin_cos"]
alt_torsion_angles_sin_cos = template_feats["template_alt_torsion_angles_sin_cos"]
torsion_angles_mask = template_feats["template_torsion_angles_mask"]
template_angle_feat = torch.cat(
[
nn.functional.one_hot(template_aatype, 22).to(torch.float32),
torsion_angles_sin_cos.reshape(*torsion_angles_sin_cos.shape[:-2], 14),
alt_torsion_angles_sin_cos.reshape(
*alt_torsion_angles_sin_cos.shape[:-2], 14
),
torsion_angles_mask,
],
dim=-1,
)
return template_angle_feat
def build_template_pair_feat(
batch: Dict[str, Any],
min_bin: float,
max_bin: float,
no_bins: int,
use_unit_vector: bool = False,
eps: float = 1e-20,
inf: float = 1e8,
chunk=None
):
if chunk and 1 <= chunk <= 4:
for k, v in batch.items():
batch[k] = v.cpu()
template_mask = batch["template_pseudo_beta_mask"]
template_mask_2d = template_mask[..., None] * template_mask[..., None, :]
# Compute distogram (this seems to differ slightly from Alg. 5)
tpb = batch["template_pseudo_beta"]
dgram = dgram_from_positions(tpb, min_bin, max_bin, no_bins, inf)
to_concat = [dgram, template_mask_2d[..., None]]
aatype_one_hot = nn.functional.one_hot(
batch["template_aatype"],
rc.restype_num + 2,
)
n_res = batch["template_aatype"].shape[-1]
to_concat.append(
aatype_one_hot[..., None, :, :].expand(
*aatype_one_hot.shape[:-2], n_res, -1, -1
).to(dgram.dtype)
)
to_concat.append(
aatype_one_hot[..., None, :].expand(*aatype_one_hot.shape[:-2], -1, n_res, -1).to(dgram.dtype)
)
n, ca, c = [rc.atom_order[a] for a in ["N", "CA", "C"]]
rigids = Rigid.make_transform_from_reference(
n_xyz=batch["template_all_atom_positions"][..., n, :],
ca_xyz=batch["template_all_atom_positions"][..., ca, :],
c_xyz=batch["template_all_atom_positions"][..., c, :],
eps=eps,
)
points = rigids.get_trans()[..., None, :, :]
rigid_vec = rigids[..., None].invert_apply(points)
del rigids, points
inv_distance_scalar = torch.rsqrt(eps + torch.sum(rigid_vec**2, dim=-1))
t_aa_masks = batch["template_all_atom_mask"]
template_mask = t_aa_masks[..., n] * t_aa_masks[..., ca] * t_aa_masks[..., c]
del t_aa_masks, n, ca, c
template_mask_2d = template_mask[..., None] * template_mask[..., None, :]
inv_distance_scalar = inv_distance_scalar * template_mask_2d
unit_vector = rigid_vec * inv_distance_scalar[..., None]
if not use_unit_vector:
unit_vector = unit_vector * 0.0
to_concat.extend(torch.unbind(unit_vector[..., None, :], dim=-1))
to_concat.append(template_mask_2d[..., None])
del unit_vector, rigid_vec, inv_distance_scalar
act = torch.cat(to_concat, dim=-1)
act = act * template_mask_2d[..., None]
return act
def build_extra_msa_feat(batch: Dict[str, Any]) -> torch.Tensor:
msa_1hot = nn.functional.one_hot(batch["extra_msa"], 23)
msa_feat = [
msa_1hot.to(torch.float32),
batch["extra_has_deletion"].unsqueeze(-1),
batch["extra_deletion_value"].unsqueeze(-1),
]
return torch.cat(msa_feat, dim=-1)
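# Shape sketch (illustrative): the extra-MSA feature concatenates a 23-way
# one-hot encoding of the extra MSA with the two deletion channels, giving a
# final dimension of 25.
#
#     # batch["extra_msa"]: (N_extra, N_res) long
#     # batch["extra_has_deletion"], batch["extra_deletion_value"]: (N_extra, N_res)
#     feat = build_extra_msa_feat(batch)  # (N_extra, N_res, 25)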
def torsion_angles_to_frames(
r: Union[Rigid3Array, Rigid],
alpha: torch.Tensor,
aatype: torch.Tensor,
rrgdf: torch.Tensor,
) -> Union[Rigid, Rigid3Array]:
# [*, N, 8, 4, 4]
default_4x4 = rrgdf[aatype, ...]
# [*, N, 8] transformations, i.e.
# One [*, N, 8, 3, 3] rotation matrix and
# One [*, N, 8, 3] translation matrix
default_r = r.from_tensor_4x4(default_4x4)
bb_rot = alpha.new_zeros((*((1,) * len(alpha.shape[:-1])), 2))
bb_rot[..., 1] = 1
# [*, N, 8, 2]
alpha = torch.cat([bb_rot.expand(*alpha.shape[:-2], -1, -1), alpha], dim=-2)
# [*, N, 8, 3, 3]
# Produces rotation matrices of the form:
# [
# [1, 0 , 0 ],
# [0, a_2,-a_1],
# [0, a_1, a_2]
# ]
# This follows the original code rather than the supplement, which uses
# different indices.
if type(r) == Rigid3Array:
all_rots = alpha.new_zeros(default_r.shape + (3, 3))
elif type(r) == Rigid:
all_rots = alpha.new_zeros(default_r.get_rots().get_rot_mats().shape)
else:
raise TypeError(f"Wrong type of Rigid: {type(r)}")
all_rots[..., 0, 0] = 1
all_rots[..., 1, 1] = alpha[..., 1]
all_rots[..., 1, 2] = -alpha[..., 0]
all_rots[..., 2, 1:] = alpha
if type(r) == Rigid3Array:
all_rots = Rot3Array.from_array(all_rots)
all_frames = default_r.compose_rotation(all_rots)
elif type(r) == Rigid:
all_rots = Rigid(Rotation(rot_mats=all_rots), None)
all_frames = default_r.compose(all_rots)
else:
raise TypeError(f"Wrong type of Rigid: {type(r)}")
chi2_frame_to_frame = all_frames[..., 5]
chi3_frame_to_frame = all_frames[..., 6]
chi4_frame_to_frame = all_frames[..., 7]
chi1_frame_to_bb = all_frames[..., 4]
chi2_frame_to_bb = chi1_frame_to_bb.compose(chi2_frame_to_frame)
chi3_frame_to_bb = chi2_frame_to_bb.compose(chi3_frame_to_frame)
chi4_frame_to_bb = chi3_frame_to_bb.compose(chi4_frame_to_frame)
if type(all_frames) == Rigid3Array:
all_frames_to_bb = Rigid3Array.cat(
[
all_frames[..., :5],
chi2_frame_to_bb.unsqueeze(-1),
chi3_frame_to_bb.unsqueeze(-1),
chi4_frame_to_bb.unsqueeze(-1),
],
dim=-1,
)
elif type(all_frames) == Rigid:
all_frames_to_bb = Rigid.cat(
[
all_frames[..., :5],
chi2_frame_to_bb.unsqueeze(-1),
chi3_frame_to_bb.unsqueeze(-1),
chi4_frame_to_bb.unsqueeze(-1),
],
dim=-1,
)
all_frames_to_global = r[..., None].compose(all_frames_to_bb)
return all_frames_to_global
def frames_and_literature_positions_to_atom14_pos(
r: Union[Rigid3Array, Rigid],
aatype: torch.Tensor,
default_frames: torch.Tensor,
group_idx: torch.Tensor,
atom_mask: torch.Tensor,
lit_positions: torch.Tensor,
) -> torch.Tensor:
# [*, N, 14, 4, 4]
default_4x4 = default_frames[aatype, ...]
# [*, N, 14]
group_mask = group_idx[aatype, ...]
# [*, N, 14, 8]
if type(r) == Rigid3Array:
group_mask = nn.functional.one_hot(
group_mask.long(),
num_classes=default_frames.shape[-3],
)
elif type(r) == Rigid:
group_mask = nn.functional.one_hot(
group_mask,
num_classes=default_frames.shape[-3],
)
else:
raise TypeError(f"Wrong type of Rigid: {type(r)}")
# [*, N, 14, 8]
t_atoms_to_global = r[..., None, :] * group_mask
# [*, N, 14]
t_atoms_to_global = t_atoms_to_global.map_tensor_fn(lambda x: torch.sum(x, dim=-1))
# [*, N, 14, 1]
if type(r) == Rigid:
atom_mask = atom_mask[aatype, ...].unsqueeze(-1)
elif type(r) == Rigid3Array:
atom_mask = atom_mask[aatype, ...]
# [*, N, 14, 3]
lit_positions = lit_positions[aatype, ...]
pred_positions = t_atoms_to_global.apply(lit_positions)
pred_positions = pred_positions * atom_mask
return pred_positions