Commit 13f8f163 authored by zhuwenwen's avatar zhuwenwen
Browse files
parents a509a4c5 b5fa2ba3
Pipeline #235 failed with stages
in 0 seconds
from collections import OrderedDict
import copy
import torch
import torch.nn as nn
from openfold.utils.tensor_utils import tensor_tree_map
class ExponentialMovingAverage:
    """
    Maintains moving averages of parameters with exponential decay

    At each step, the stored copy `copy` of each parameter `param` is
    updated as follows:

        `copy = decay * copy + (1 - decay) * param`

    where `decay` is an attribute of the ExponentialMovingAverage object.
    """

    def __init__(self, model: nn.Module, decay: float):
        """
        Args:
            model:
                A torch.nn.Module whose parameters are to be tracked
            decay:
                A value (usually close to 1.) by which updates are
                weighted as part of the above formula
        """
        super(ExponentialMovingAverage, self).__init__()
        # Detached clones so EMA updates never touch the live parameters
        # or participate in autograd
        clone_param = lambda t: t.clone().detach()
        self.params = tensor_tree_map(clone_param, model.state_dict())
        self.decay = decay
        self.device = next(model.parameters()).device

    def to(self, device):
        # Moves only the stored averages; the tracked model is unaffected
        self.params = tensor_tree_map(lambda t: t.to(device), self.params)
        self.device = device

    def _update_state_dict_(self, update, state_dict):
        # Recursively applies, in place on `state_dict` tensors:
        #   stored <- stored - (1 - decay) * (stored - update)
        # which is algebraically `decay * stored + (1 - decay) * update`
        with torch.no_grad():
            for k, v in update.items():
                stored = state_dict[k]
                if not isinstance(v, torch.Tensor):
                    # Nested (dict-like) entry: recurse into it
                    self._update_state_dict_(v, stored)
                else:
                    diff = stored - v
                    diff *= 1 - self.decay
                    stored -= diff  # in-place, so self.params is updated

    def update(self, model: torch.nn.Module) -> None:
        """
        Updates the stored parameters using the state dict of the provided
        module. The module should have the same structure as that used to
        initialize the ExponentialMovingAverage object.
        """
        self._update_state_dict_(model.state_dict(), self.params)

    def load_state_dict(self, state_dict: OrderedDict) -> None:
        # Tensors are cloned on load; keys missing from state_dict leave
        # existing entries untouched
        for k in state_dict["params"].keys():
            self.params[k] = state_dict["params"][k].clone()
        self.decay = state_dict["decay"]

    def state_dict(self) -> OrderedDict:
        # NOTE(review): returns references to the live average tensors,
        # not copies — callers must clone before mutating
        return OrderedDict(
            {
                "params": self.params,
                "decay": self.decay,
            }
        )
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import torch
import torch.nn as nn
from typing import Dict
from openfold.np import protein
import openfold.np.residue_constants as rc
from openfold.utils.rigid_utils import Rotation, Rigid
from openfold.utils.tensor_utils import (
batched_gather,
one_hot,
tree_map,
tensor_tree_map,
)
def pseudo_beta_fn(aatype, all_atom_positions, all_atom_masks):
    """Select each residue's pseudo-beta atom: CB, except CA for glycine.

    Args:
        aatype: [..., N] residue type indices
        all_atom_positions: [..., N, n_atoms, 3] atom coordinates
        all_atom_masks: [..., N, n_atoms] atom existence mask, or None
    Returns:
        pseudo_beta coordinates, plus the matching mask when
        all_atom_masks is provided.
    """
    gly_mask = aatype == rc.restype_order["G"]
    ca_idx = rc.atom_order["CA"]
    cb_idx = rc.atom_order["CB"]

    # Broadcast the glycine mask across the xyz dimension
    coord_selector = gly_mask[..., None].expand(
        *((-1,) * len(gly_mask.shape)), 3
    )
    pseudo_beta = torch.where(
        coord_selector,
        all_atom_positions[..., ca_idx, :],
        all_atom_positions[..., cb_idx, :],
    )

    if all_atom_masks is None:
        return pseudo_beta

    pseudo_beta_mask = torch.where(
        gly_mask,
        all_atom_masks[..., ca_idx],
        all_atom_masks[..., cb_idx],
    )
    return pseudo_beta, pseudo_beta_mask
def atom14_to_atom37(atom14, batch):
    """Convert atom14 coordinates to the atom37 layout.

    Uses the per-residue index map in batch["residx_atom37_to_atom14"]
    and zeroes positions of atoms absent from the atom37 representation.
    """
    gathered = batched_gather(
        atom14,
        batch["residx_atom37_to_atom14"],
        dim=-2,
        no_batch_dims=len(atom14.shape[:-2]),
    )
    # Mask out slots that do not correspond to real atoms
    return gathered * batch["atom37_atom_exists"][..., None]
def build_template_angle_feat(template_feats):
    """Assemble per-residue template angle features.

    Concatenates, along the last dimension: a 22-way aatype one-hot,
    flattened torsion sin/cos (7 angles x 2 = 14), flattened alternate
    torsion sin/cos (14), and the 7-dim torsion mask — 57 channels total.
    """
    aatype = template_feats["template_aatype"]
    torsions = template_feats["template_torsion_angles_sin_cos"]
    alt_torsions = template_feats["template_alt_torsion_angles_sin_cos"]
    torsion_mask = template_feats["template_torsion_angles_mask"]

    # Flatten the trailing [7, 2] sin/cos dims into a single 14-dim axis
    flatten_sin_cos = lambda t: t.reshape(*t.shape[:-2], 14)

    pieces = [
        nn.functional.one_hot(aatype, 22),
        flatten_sin_cos(torsions),
        flatten_sin_cos(alt_torsions),
        torsion_mask,
    ]
    return torch.cat(pieces, dim=-1)
def build_template_pair_feat(
    batch,
    min_bin, max_bin, no_bins,
    use_unit_vector=False,
    eps=1e-20, inf=1e8
):
    """Build pairwise template features.

    Concatenates, per residue pair: a pseudo-beta distogram (no_bins
    channels), a 2D pseudo-beta mask, aatype one-hots broadcast along
    rows and columns, inter-residue backbone-frame unit vectors
    (zeroed unless use_unit_vector) and a backbone-derived 2D mask.

    Args:
        batch: dict of template features (pseudo-beta coords/mask,
            aatype, all-atom positions/mask)
        min_bin, max_bin, no_bins: distogram binning parameters
        use_unit_vector: if False, the unit-vector channels are zeroed
        eps: numerical-stability epsilon for the rigid construction
        inf: stand-in for the final (open-ended) distogram bin edge
    """
    template_mask = batch["template_pseudo_beta_mask"]
    template_mask_2d = template_mask[..., None] * template_mask[..., None, :]

    # Compute distogram (this seems to differ slightly from Alg. 5)
    tpb = batch["template_pseudo_beta"]
    dgram = torch.sum(
        (tpb[..., None, :] - tpb[..., None, :, :]) ** 2, dim=-1, keepdim=True
    )
    # Bin edges are squared so dgram can stay in squared-distance space
    lower = torch.linspace(min_bin, max_bin, no_bins, device=tpb.device) ** 2
    upper = torch.cat([lower[1:], lower.new_tensor([inf])], dim=-1)
    dgram = ((dgram > lower) * (dgram < upper)).type(dgram.dtype)

    to_concat = [dgram, template_mask_2d[..., None]]

    aatype_one_hot = nn.functional.one_hot(
        batch["template_aatype"],
        rc.restype_num + 2,
    )

    n_res = batch["template_aatype"].shape[-1]
    # Broadcast the per-residue one-hots along rows and along columns
    to_concat.append(
        aatype_one_hot[..., None, :, :].expand(
            *aatype_one_hot.shape[:-2], n_res, -1, -1
        )
    )
    to_concat.append(
        aatype_one_hot[..., None, :].expand(
            *aatype_one_hot.shape[:-2], -1, n_res, -1
        )
    )

    # Backbone frames from N/CA/C positions; used for pairwise unit vectors
    n, ca, c = [rc.atom_order[a] for a in ["N", "CA", "C"]]
    rigids = Rigid.make_transform_from_reference(
        n_xyz=batch["template_all_atom_positions"][..., n, :],
        ca_xyz=batch["template_all_atom_positions"][..., ca, :],
        c_xyz=batch["template_all_atom_positions"][..., c, :],
        eps=eps,
    )
    points = rigids.get_trans()[..., None, :, :]
    rigid_vec = rigids[..., None].invert_apply(points)

    inv_distance_scalar = torch.rsqrt(eps + torch.sum(rigid_vec ** 2, dim=-1))

    # NOTE(review): template_mask/_2d are recomputed here from backbone
    # atom masks, shadowing the pseudo-beta-based versions above; the
    # final masking below uses this backbone-derived mask
    t_aa_masks = batch["template_all_atom_mask"]
    template_mask = (
        t_aa_masks[..., n] * t_aa_masks[..., ca] * t_aa_masks[..., c]
    )
    template_mask_2d = template_mask[..., None] * template_mask[..., None, :]

    inv_distance_scalar = inv_distance_scalar * template_mask_2d
    unit_vector = rigid_vec * inv_distance_scalar[..., None]

    if(not use_unit_vector):
        unit_vector = unit_vector * 0.

    to_concat.extend(torch.unbind(unit_vector[..., None, :], dim=-1))
    to_concat.append(template_mask_2d[..., None])

    act = torch.cat(to_concat, dim=-1)
    act = act * template_mask_2d[..., None]

    return act
def build_extra_msa_feat(batch):
    """Featurize the extra MSA.

    Concatenates a 23-way one-hot of the MSA entries with the
    has-deletion and deletion-value channels (25 channels total).
    """
    return torch.cat(
        [
            nn.functional.one_hot(batch["extra_msa"], 23),
            batch["extra_has_deletion"].unsqueeze(-1),
            batch["extra_deletion_value"].unsqueeze(-1),
        ],
        dim=-1,
    )
def torsion_angles_to_frames(
    r: Rigid,
    alpha: torch.Tensor,
    aatype: torch.Tensor,
    rrgdf: torch.Tensor,
):
    """Compute the 8 per-residue rigid-group frames in global coordinates
    from backbone frames and predicted torsion angles.

    Args:
        r: [*, N] backbone-to-global rigid transforms
        alpha: [*, N, 7, 2] sin/cos of the 7 torsion angles
        aatype: [*, N] residue type indices
        rrgdf: per-restype rigid-group default frames,
            [no. restypes, 8, 4, 4]
    Returns:
        [*, N, 8] rigid transforms mapping each rigid group to global
        coordinates.
    """
    # [*, N, 8, 4, 4]
    default_4x4 = rrgdf[aatype, ...]

    # [*, N, 8] transformations, i.e.
    # One [*, N, 8, 3, 3] rotation matrix and
    # One [*, N, 8, 3] translation matrix
    default_r = r.from_tensor_4x4(default_4x4)

    # Prepend an identity "torsion" (sin=0, cos=1) for the backbone group
    bb_rot = alpha.new_zeros((*((1,) * len(alpha.shape[:-1])), 2))
    bb_rot[..., 1] = 1

    # [*, N, 8, 2]
    alpha = torch.cat(
        [bb_rot.expand(*alpha.shape[:-2], -1, -1), alpha], dim=-2
    )

    # [*, N, 8, 3, 3]
    # Produces rotation matrices of the form:
    # [
    #   [1, 0  , 0  ],
    #   [0, a_2,-a_1],
    #   [0, a_1, a_2]
    # ]
    # This follows the original code rather than the supplement, which uses
    # different indices.
    all_rots = alpha.new_zeros(default_r.get_rots().get_rot_mats().shape)
    all_rots[..., 0, 0] = 1
    all_rots[..., 1, 1] = alpha[..., 1]
    all_rots[..., 1, 2] = -alpha[..., 0]
    all_rots[..., 2, 1:] = alpha

    all_rots = Rigid(Rotation(rot_mats=all_rots), None)
    all_frames = default_r.compose(all_rots)

    # Chi2-4 frames are defined relative to the preceding chi frame, so
    # compose them back down to the backbone frame
    chi2_frame_to_frame = all_frames[..., 5]
    chi3_frame_to_frame = all_frames[..., 6]
    chi4_frame_to_frame = all_frames[..., 7]

    chi1_frame_to_bb = all_frames[..., 4]
    chi2_frame_to_bb = chi1_frame_to_bb.compose(chi2_frame_to_frame)
    chi3_frame_to_bb = chi2_frame_to_bb.compose(chi3_frame_to_frame)
    chi4_frame_to_bb = chi3_frame_to_bb.compose(chi4_frame_to_frame)

    all_frames_to_bb = Rigid.cat(
        [
            all_frames[..., :5],
            chi2_frame_to_bb.unsqueeze(-1),
            chi3_frame_to_bb.unsqueeze(-1),
            chi4_frame_to_bb.unsqueeze(-1),
        ],
        dim=-1,
    )

    # Lift every group frame from backbone-local to global coordinates
    all_frames_to_global = r[..., None].compose(all_frames_to_bb)

    return all_frames_to_global
def frames_and_literature_positions_to_atom14_pos(
    r: Rigid,
    aatype: torch.Tensor,
    default_frames,
    group_idx,
    atom_mask,
    lit_positions,
):
    """Map idealized (literature) atom positions to global coordinates
    using the per-residue rigid-group frames.

    Args:
        r: [*, N, 8] rigid-group-to-global transforms
        aatype: [*, N] residue type indices
        default_frames: per-restype default frames; only its group-count
            dim (shape[-3]) is used here
        group_idx: per-restype map from atom14 slot to rigid group index
        atom_mask: per-restype atom14 existence mask
        lit_positions: per-restype idealized atom14 coordinates
    Returns:
        [*, N, 14, 3] predicted atom positions, zeroed for absent atoms.
    """
    # (Removed a dead `default_4x4 = default_frames[aatype, ...]` local
    # that was computed but never used.)

    # [*, N, 14]
    group_mask = group_idx[aatype, ...]

    # [*, N, 14, 8] one-hot over the rigid group each atom belongs to
    group_mask = nn.functional.one_hot(
        group_mask,
        num_classes=default_frames.shape[-3],
    )

    # [*, N, 14, 8] select each atom's frame via the one-hot mask
    t_atoms_to_global = r[..., None, :] * group_mask

    # [*, N, 14] collapse the group dimension
    t_atoms_to_global = t_atoms_to_global.map_tensor_fn(
        lambda x: torch.sum(x, dim=-1)
    )

    # [*, N, 14, 1]
    atom_mask = atom_mask[aatype, ...].unsqueeze(-1)

    # [*, N, 14, 3] apply each atom's frame to its idealized position
    lit_positions = lit_positions[aatype, ...]
    pred_positions = t_atoms_to_global.apply(lit_positions)
    pred_positions = pred_positions * atom_mask

    return pred_positions
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
from dataclasses import dataclass
from functools import partial
import numpy as np
import torch
from typing import Union, List
# Prefix shared by every parameter key in AlphaFold's npz weight files
_NPZ_KEY_PREFIX = "alphafold/alphafold_iteration/"
# With Param, a poor man's enum with attributes (Rust-style)
class ParamType(Enum):
    """Tags a parameter with the reshape/transpose that converts the
    AlphaFold (JAX) weight layout into the layout the model expects."""
    LinearWeight = partial(  # hack: partial prevents fns from becoming methods
        lambda w: w.transpose(-1, -2)
    )
    LinearWeightMHA = partial(
        lambda w: w.reshape(*w.shape[:-2], -1).transpose(-1, -2)
    )
    LinearMHAOutputWeight = partial(
        lambda w: w.reshape(*w.shape[:-3], -1, w.shape[-1]).transpose(-1, -2)
    )
    LinearBiasMHA = partial(lambda w: w.reshape(*w.shape[:-2], -1))
    LinearWeightOPM = partial(
        lambda w: w.reshape(*w.shape[:-3], -1, w.shape[-1]).transpose(-1, -2)
    )
    Other = partial(lambda w: w)

    def __init__(self, fn):
        # Each member's value (the partial) is exposed as .transformation
        self.transformation = fn
@dataclass
class Param:
    """A target model tensor (or list of tensors, if stacked) paired
    with the ParamType transformation used to import npz weights."""
    # Target tensor(s) in the PyTorch model
    param: Union[torch.Tensor, List[torch.Tensor]]
    # Layout transformation applied to incoming npz weights
    param_type: ParamType = ParamType.Other
    # True when `param` is a list gathered across stacked blocks
    stacked: bool = False
def process_translation_dict(d, top_layer=True):
    """Flatten a nested translation dict into {npz_key: value}.

    Nested keys are joined with "/". Leaf keys below the top layer get
    an extra leading "/", reproducing the "//" separator AlphaFold's
    npz files use before the final parameter name. Top-level entries
    are additionally prefixed with _NPZ_KEY_PREFIX.

    Args:
        d: nested dict of Param leaves
        top_layer: whether this call is at the outermost nesting level
    Returns:
        A flat dict keyed by full npz parameter names.
    """
    flat = {}
    for k, v in d.items():
        # isinstance (rather than `type(v) == dict`) so dict subclasses
        # such as OrderedDict are also descended into
        if isinstance(v, dict):
            prefix = _NPZ_KEY_PREFIX if top_layer else ""
            sub_flat = {
                (prefix + "/".join([k, k_prime])): v_prime
                for k_prime, v_prime in process_translation_dict(
                    v, top_layer=False
                ).items()
            }
            flat.update(sub_flat)
        else:
            # Leaf below the top layer: prepend "/" to form the "//"
            # separator when joined by the caller
            k = "/" + k if not top_layer else k
            flat[k] = v

    return flat
def stacked(param_dict_list, out=None):
    """
    Stack "parallel" Params from a list of identically-structured dicts.

    Args:
        param_dict_list:
            A list of (nested) Param dicts to stack. The structure of
            each dict must be the identical (down to the ParamTypes of
            "parallel" Params). There must be at least one dict
            in the list.
        out:
            Optional output dict, used internally for recursion.
    Returns:
        A nested dict mirroring the input structure in which each leaf
        is a single Param wrapping the list of parallel tensors
        (stacked=True). Leaves that are neither dicts nor Params are
        silently dropped, as in the original implementation.
    """
    if out is None:
        out = {}
    template = param_dict_list[0]
    # Iterate keys directly; the template's values are re-fetched below
    for k in template:
        v = [d[k] for d in param_dict_list]
        if isinstance(v[0], dict):
            out[k] = {}
            stacked(v, out=out[k])
        elif isinstance(v[0], Param):
            out[k] = Param(
                param=[p.param for p in v],
                param_type=v[0].param_type,
                stacked=True,
            )

    return out
def assign(translation_dict, orig_weights):
    """Copy weights from a flat npz-style dict into model tensors.

    Args:
        translation_dict: flat {npz_key: Param} mapping
        orig_weights: mapping from npz_key to array-like weights
    Raises:
        Re-raises any error during transformation/copy after printing
        the offending key and shapes for debugging.
    """
    for k, param in translation_dict.items():
        with torch.no_grad():
            weights = torch.as_tensor(orig_weights[k])
            ref, param_type = param.param, param.param_type
            if param.stacked:
                # One leading-dim slice per stacked block
                weights = torch.unbind(weights, 0)
            else:
                weights = [weights]
                ref = [ref]

            try:
                weights = list(map(param_type.transformation, weights))
                for p, w in zip(ref, weights):
                    p.copy_(w)
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are not swallowed by the debug printout
            except Exception:
                print(k)
                print(ref[0].shape)
                print(weights[0].shape)
                raise
def generate_translation_dict(model, version):
    """Build the nested dict mapping AlphaFold npz parameter names to
    Param wrappers around the corresponding tensors of `model`.

    Args:
        model: model whose submodule layout mirrors AlphaFold's
            (input embedder, evoformer, structure module, aux heads)
        version: AlphaFold parameter-set name (e.g. "model_1",
            "model_3_ptm"); controls template and pTM-head entries
    Returns:
        A nested dict of Param objects keyed by AlphaFold module names,
        suitable for process_translation_dict()/assign().
    """
    #######################
    # Some templates
    #######################
    # Small factories pairing a tensor with the layout transformation
    # its npz counterpart needs
    LinearWeight = lambda l: (Param(l, param_type=ParamType.LinearWeight))
    LinearBias = lambda l: (Param(l))
    LinearWeightMHA = lambda l: (Param(l, param_type=ParamType.LinearWeightMHA))
    LinearBiasMHA = lambda b: (Param(b, param_type=ParamType.LinearBiasMHA))
    LinearWeightOPM = lambda l: (Param(l, param_type=ParamType.LinearWeightOPM))

    LinearParams = lambda l: {
        "weights": LinearWeight(l.weight),
        "bias": LinearBias(l.bias),
    }

    LayerNormParams = lambda l: {
        "scale": Param(l.weight),
        "offset": Param(l.bias),
    }

    AttentionParams = lambda att: {
        "query_w": LinearWeightMHA(att.linear_q.weight),
        "key_w": LinearWeightMHA(att.linear_k.weight),
        "value_w": LinearWeightMHA(att.linear_v.weight),
        "output_w": Param(
            att.linear_o.weight,
            param_type=ParamType.LinearMHAOutputWeight,
        ),
        "output_b": LinearBias(att.linear_o.bias),
    }

    AttentionGatedParams = lambda att: dict(
        **AttentionParams(att),
        **{
            "gating_w": LinearWeightMHA(att.linear_g.weight),
            "gating_b": LinearBiasMHA(att.linear_g.bias),
        },
    )

    # Global attention overrides key/value with un-split (non-MHA) weights
    GlobalAttentionParams = lambda att: dict(
        AttentionGatedParams(att),
        key_w=LinearWeight(att.linear_k.weight),
        value_w=LinearWeight(att.linear_v.weight),
    )

    TriAttParams = lambda tri_att: {
        "query_norm": LayerNormParams(tri_att.layer_norm),
        "feat_2d_weights": LinearWeight(tri_att.linear.weight),
        "attention": AttentionGatedParams(tri_att.mha),
    }

    TriMulOutParams = lambda tri_mul: {
        "layer_norm_input": LayerNormParams(tri_mul.layer_norm_in),
        "left_projection": LinearParams(tri_mul.linear_a_p),
        "right_projection": LinearParams(tri_mul.linear_b_p),
        "left_gate": LinearParams(tri_mul.linear_a_g),
        "right_gate": LinearParams(tri_mul.linear_b_g),
        "center_layer_norm": LayerNormParams(tri_mul.layer_norm_out),
        "output_projection": LinearParams(tri_mul.linear_z),
        "gating_linear": LinearParams(tri_mul.linear_g),
    }

    # see commit b88f8da on the Alphafold repo
    # Alphafold swaps the pseudocode's a and b between the incoming/outcoming
    # iterations of triangle multiplication, which is confusing and not
    # reproduced in our implementation.
    TriMulInParams = lambda tri_mul: {
        "layer_norm_input": LayerNormParams(tri_mul.layer_norm_in),
        "left_projection": LinearParams(tri_mul.linear_b_p),
        "right_projection": LinearParams(tri_mul.linear_a_p),
        "left_gate": LinearParams(tri_mul.linear_b_g),
        "right_gate": LinearParams(tri_mul.linear_a_g),
        "center_layer_norm": LayerNormParams(tri_mul.layer_norm_out),
        "output_projection": LinearParams(tri_mul.linear_z),
        "gating_linear": LinearParams(tri_mul.linear_g),
    }

    PairTransitionParams = lambda pt: {
        "input_layer_norm": LayerNormParams(pt.layer_norm),
        "transition1": LinearParams(pt.linear_1),
        "transition2": LinearParams(pt.linear_2),
    }

    MSAAttParams = lambda matt: {
        "query_norm": LayerNormParams(matt.layer_norm_m),
        "attention": AttentionGatedParams(matt.mha),
    }

    MSAColAttParams = lambda matt: {
        "query_norm": LayerNormParams(matt._msa_att.layer_norm_m),
        "attention": AttentionGatedParams(matt._msa_att.mha),
    }

    MSAGlobalAttParams = lambda matt: {
        "query_norm": LayerNormParams(matt.layer_norm_m),
        "attention": GlobalAttentionParams(matt.global_attention),
    }

    MSAAttPairBiasParams = lambda matt: dict(
        **MSAAttParams(matt),
        **{
            "feat_2d_norm": LayerNormParams(matt.layer_norm_z),
            "feat_2d_weights": LinearWeight(matt.linear_z.weight),
        },
    )

    IPAParams = lambda ipa: {
        "q_scalar": LinearParams(ipa.linear_q),
        "kv_scalar": LinearParams(ipa.linear_kv),
        "q_point_local": LinearParams(ipa.linear_q_points),
        "kv_point_local": LinearParams(ipa.linear_kv_points),
        "trainable_point_weights": Param(
            param=ipa.head_weights, param_type=ParamType.Other
        ),
        "attention_2d": LinearParams(ipa.linear_b),
        "output_projection": LinearParams(ipa.linear_out),
    }

    TemplatePairBlockParams = lambda b: {
        "triangle_attention_starting_node": TriAttParams(b.tri_att_start),
        "triangle_attention_ending_node": TriAttParams(b.tri_att_end),
        "triangle_multiplication_outgoing": TriMulOutParams(b.tri_mul_out),
        "triangle_multiplication_incoming": TriMulInParams(b.tri_mul_in),
        "pair_transition": PairTransitionParams(b.pair_transition),
    }

    MSATransitionParams = lambda m: {
        "input_layer_norm": LayerNormParams(m.layer_norm),
        "transition1": LinearParams(m.linear_1),
        "transition2": LinearParams(m.linear_2),
    }

    OuterProductMeanParams = lambda o: {
        "layer_norm_input": LayerNormParams(o.layer_norm),
        "left_projection": LinearParams(o.linear_1),
        "right_projection": LinearParams(o.linear_2),
        "output_w": LinearWeightOPM(o.linear_out.weight),
        "output_b": LinearBias(o.linear_out.bias),
    }

    def EvoformerBlockParams(b, is_extra_msa=False):
        # Extra-MSA blocks use global attention for the column dimension
        if is_extra_msa:
            col_att_name = "msa_column_global_attention"
            msa_col_att_params = MSAGlobalAttParams(b.msa_att_col)
        else:
            col_att_name = "msa_column_attention"
            msa_col_att_params = MSAColAttParams(b.msa_att_col)

        d = {
            "msa_row_attention_with_pair_bias": MSAAttPairBiasParams(
                b.msa_att_row
            ),
            col_att_name: msa_col_att_params,
            "msa_transition": MSATransitionParams(b.core.msa_transition),
            "outer_product_mean":
                OuterProductMeanParams(b.core.outer_product_mean),
            "triangle_multiplication_outgoing":
                TriMulOutParams(b.core.tri_mul_out),
            "triangle_multiplication_incoming":
                TriMulInParams(b.core.tri_mul_in),
            "triangle_attention_starting_node":
                TriAttParams(b.core.tri_att_start),
            "triangle_attention_ending_node":
                TriAttParams(b.core.tri_att_end),
            "pair_transition":
                PairTransitionParams(b.core.pair_transition),
        }

        return d

    ExtraMSABlockParams = partial(EvoformerBlockParams, is_extra_msa=True)

    FoldIterationParams = lambda sm: {
        "invariant_point_attention": IPAParams(sm.ipa),
        "attention_layer_norm": LayerNormParams(sm.layer_norm_ipa),
        "transition": LinearParams(sm.transition.layers[0].linear_1),
        "transition_1": LinearParams(sm.transition.layers[0].linear_2),
        "transition_2": LinearParams(sm.transition.layers[0].linear_3),
        "transition_layer_norm": LayerNormParams(sm.transition.layer_norm),
        "affine_update": LinearParams(sm.bb_update.linear),
        "rigid_sidechain": {
            "input_projection": LinearParams(sm.angle_resnet.linear_in),
            "input_projection_1": LinearParams(sm.angle_resnet.linear_initial),
            "resblock1": LinearParams(sm.angle_resnet.layers[0].linear_1),
            "resblock2": LinearParams(sm.angle_resnet.layers[0].linear_2),
            "resblock1_1": LinearParams(sm.angle_resnet.layers[1].linear_1),
            "resblock2_1": LinearParams(sm.angle_resnet.layers[1].linear_2),
            "unnormalized_angles": LinearParams(sm.angle_resnet.linear_out),
        },
    }

    ############################
    # translations dict overflow
    ############################

    # Per-layer dicts are stacked so they match AlphaFold's layer-stacked
    # npz arrays
    ems_blocks = model.extra_msa_stack.blocks
    ems_blocks_params = stacked([ExtraMSABlockParams(b) for b in ems_blocks])

    evo_blocks = model.evoformer.blocks
    evo_blocks_params = stacked([EvoformerBlockParams(b) for b in evo_blocks])

    translations = {
        "evoformer": {
            "preprocess_1d": LinearParams(model.input_embedder.linear_tf_m),
            "preprocess_msa": LinearParams(model.input_embedder.linear_msa_m),
            "left_single": LinearParams(model.input_embedder.linear_tf_z_i),
            "right_single": LinearParams(model.input_embedder.linear_tf_z_j),
            "prev_pos_linear": LinearParams(model.recycling_embedder.linear),
            "prev_msa_first_row_norm": LayerNormParams(
                model.recycling_embedder.layer_norm_m
            ),
            "prev_pair_norm": LayerNormParams(
                model.recycling_embedder.layer_norm_z
            ),
            # NOTE: "pair_activiations" reproduces the spelling of the
            # key as it appears in the npz files
            "pair_activiations": LinearParams(
                model.input_embedder.linear_relpos
            ),
            "extra_msa_activations": LinearParams(
                model.extra_msa_embedder.linear
            ),
            "extra_msa_stack": ems_blocks_params,
            "evoformer_iteration": evo_blocks_params,
            "single_activations": LinearParams(model.evoformer.linear),
        },
        "structure_module": {
            "single_layer_norm": LayerNormParams(
                model.structure_module.layer_norm_s
            ),
            "initial_projection": LinearParams(
                model.structure_module.linear_in
            ),
            "pair_layer_norm": LayerNormParams(
                model.structure_module.layer_norm_z
            ),
            "fold_iteration": FoldIterationParams(model.structure_module),
        },
        "predicted_lddt_head": {
            "input_layer_norm": LayerNormParams(
                model.aux_heads.plddt.layer_norm
            ),
            "act_0": LinearParams(model.aux_heads.plddt.linear_1),
            "act_1": LinearParams(model.aux_heads.plddt.linear_2),
            "logits": LinearParams(model.aux_heads.plddt.linear_3),
        },
        "distogram_head": {
            "half_logits": LinearParams(model.aux_heads.distogram.linear),
        },
        "experimentally_resolved_head": {
            "logits": LinearParams(
                model.aux_heads.experimentally_resolved.linear
            ),
        },
        "masked_msa_head": {
            "logits": LinearParams(model.aux_heads.masked_msa.linear),
        },
    }

    # Versions that ship without template weights
    no_templ = [
        "model_3",
        "model_4",
        "model_5",
        "model_3_ptm",
        "model_4_ptm",
        "model_5_ptm",
    ]
    if version not in no_templ:
        tps_blocks = model.template_pair_stack.blocks
        tps_blocks_params = stacked(
            [TemplatePairBlockParams(b) for b in tps_blocks]
        )
        template_param_dict = {
            "template_embedding": {
                "single_template_embedding": {
                    "embedding2d": LinearParams(
                        model.template_pair_embedder.linear
                    ),
                    "template_pair_stack": {
                        "__layer_stack_no_state": tps_blocks_params,
                    },
                    "output_layer_norm": LayerNormParams(
                        model.template_pair_stack.layer_norm
                    ),
                },
                "attention": AttentionParams(model.template_pointwise_att.mha),
            },
            "template_single_embedding": LinearParams(
                model.template_angle_embedder.linear_1
            ),
            "template_projection": LinearParams(
                model.template_angle_embedder.linear_2
            ),
        }

        translations["evoformer"].update(template_param_dict)

    # pTM variants additionally carry the predicted-aligned-error head
    if "_ptm" in version:
        translations["predicted_aligned_error_head"] = {
            "logits": LinearParams(model.aux_heads.tm.linear)
        }

    return translations
def import_jax_weights_(model, npz_path, version="model_1"):
    """Load AlphaFold JAX weights from an npz file into `model` in place.

    Args:
        model: model to populate; its layout must mirror AlphaFold's
        npz_path: path to the AlphaFold parameter npz file
        version: AlphaFold parameter-set name (e.g. "model_1",
            "model_1_ptm"); must match the npz file's contents
    Raises:
        AssertionError: if the translation produces keys absent from
            the npz file (usually a model/version mismatch).
    """
    data = np.load(npz_path)

    translations = generate_translation_dict(model, version)

    # Flatten keys and insert missing key prefixes
    flat = process_translation_dict(translations)

    # Sanity check: every translated key must exist in the npz file.
    # Keys present in the npz but not in the translation ("missing")
    # are tolerated.
    keys = list(data.keys())
    flat_keys = list(flat.keys())
    incorrect = [k for k in flat_keys if k not in keys]
    missing = [k for k in keys if k not in flat_keys]
    # Message added so a version mismatch is diagnosable from the error
    assert len(incorrect) == 0, (
        f"Translated keys not found in {npz_path} "
        f"(wrong version '{version}'?): {incorrect}"
    )

    # Set weights
    assign(flat, data)
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
from functools import reduce
from operator import mul
import torch
# Compiled CUDA extension providing the in-place softmax kernels
attn_core_inplace_cuda = importlib.import_module("attn_core_inplace_cuda")

# Dtypes accepted by the CUDA kernels (checked in forward())
SUPPORTED_DTYPES = [torch.float32, torch.bfloat16]
class AttentionCoreFunction(torch.autograd.Function):
    """Custom autograd Function computing softmax(q @ k^T + biases) @ v,
    with the softmax performed in place by the CUDA extension."""

    @staticmethod
    def forward(ctx, q, k, v, bias_1=None, bias_2=None):
        if(bias_1 is None and bias_2 is not None):
            raise ValueError("bias_1 must be specified before bias_2")
        if(q.dtype not in SUPPORTED_DTYPES):
            raise ValueError("Unsupported datatype")

        q = q.contiguous()
        k = k.contiguous()

        # [*, H, Q, K]
        attention_logits = torch.matmul(
            q, k.transpose(-1, -2),
        )
        if(bias_1 is not None):
            attention_logits += bias_1
        if(bias_2 is not None):
            attention_logits += bias_2

        # In-place softmax over the last dim; logits are overwritten with
        # attention weights
        attn_core_inplace_cuda.forward_(
            attention_logits,
            reduce(mul, attention_logits.shape[:-1]),
            attention_logits.shape[-1],
        )

        o = torch.matmul(attention_logits, v)

        # Bias shapes are kept so backward can sum gradients over the
        # broadcast (size-1) dimensions
        ctx.bias_1_shape = bias_1.shape if bias_1 is not None else None
        ctx.bias_2_shape = bias_2.shape if bias_2 is not None else None
        ctx.save_for_backward(q, k, v, attention_logits)

        return o

    @staticmethod
    def backward(ctx, grad_output):
        # attention_logits holds the saved softmax output here
        q, k, v, attention_logits = ctx.saved_tensors
        grad_q = grad_k = grad_v = grad_bias_1 = grad_bias_2 = None

        grad_v = torch.matmul(
            attention_logits.transpose(-1, -2),
            grad_output
        )

        # Overwrites attention_logits in place with the gradient w.r.t.
        # the pre-softmax logits
        attn_core_inplace_cuda.backward_(
            attention_logits,
            grad_output.contiguous(),
            v.contiguous(),  # v is implicitly transposed in the kernel
            reduce(mul, attention_logits.shape[:-1]),
            attention_logits.shape[-1],
            grad_output.shape[-1],
        )

        # Bias gradients: sum over the dimensions the bias broadcast along
        if(ctx.bias_1_shape is not None):
            grad_bias_1 = torch.sum(
                attention_logits,
                dim=tuple(i for i,d in enumerate(ctx.bias_1_shape) if d == 1),
                keepdim=True,
            )

        if(ctx.bias_2_shape is not None):
            grad_bias_2 = torch.sum(
                attention_logits,
                dim=tuple(i for i,d in enumerate(ctx.bias_2_shape) if d == 1),
                keepdim=True,
            )

        grad_q = torch.matmul(
            attention_logits, k
        )
        grad_k = torch.matmul(
            q.transpose(-1, -2), attention_logits,
        ).transpose(-1, -2)

        return grad_q, grad_k, grad_v, grad_bias_1, grad_bias_2

attention_core = AttentionCoreFunction.apply
// modified from https://github.com/NVIDIA/apex/blob/master/csrc/compat.h

// Older PyTorch releases expose AT_CHECK instead of TORCH_CHECK;
// alias it so the extension builds against both.
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif

// tensor.data<T>() was renamed data_ptr<T>() in PyTorch 1.3; select the
// correct accessor based on the build-time version flag.
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
// Copyright 2021 AlQuraishi Laboratory
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// modified from fastfold/model/fastnn/kernel/cuda_native/csrc/softmax_cuda.cpp
#include <torch/extension.h>
// Forward declarations of the CUDA implementations (defined in the
// accompanying softmax_cuda_kernel.cu translation unit).
void attn_softmax_inplace_forward_(
    at::Tensor input,
    long long rows, int cols
);
void attn_softmax_inplace_backward_(
    at::Tensor output,
    at::Tensor d_ov,
    at::Tensor values,
    long long rows,
    int cols_output,
    int cols_values
);

// Python bindings: exposed as attn_core_inplace_cuda.forward_ /
// attn_core_inplace_cuda.backward_
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def(
        "forward_",
        &attn_softmax_inplace_forward_,
        "Softmax forward (CUDA)"
    );
    m.def(
        "backward_",
        &attn_softmax_inplace_backward_,
        "Softmax backward (CUDA)"
    );
}
// Copyright 2021 AlQuraishi Laboratory
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// modified from fastfold/model/fastnn/kernel/cuda_native/csrc/softmax_cuda_kernel.cu
#include <math_constants.h>
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <iostream>
#include "ATen/ATen.h"
#include "ATen/cuda/CUDAContext.h"
#include "compat.h"
// Validate that a tensor argument lives on a CUDA device.
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
// Validate that a tensor argument is contiguous in memory.
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
// Combined precondition check applied to every kernel input.
#define CHECK_INPUT(x) \
    CHECK_CUDA(x); \
    CHECK_CONTIGUOUS(x)
// Butterfly (XOR) reduction: after log2(32) shuffle rounds, every lane
// in the warp holds the maximum of all 32 lanes' starting values.
__inline__ __device__ float WarpAllReduceMax(float val) {
    for (int offset = 1; offset < 32; offset <<= 1) {
        val = max(val, __shfl_xor_sync(0xffffffff, val, offset));
    }
    return val;
}
// Butterfly (XOR) reduction: after log2(32) shuffle rounds, every lane
// in the warp holds the sum of all 32 lanes' starting values.
__inline__ __device__ float WarpAllReduceSum(float val) {
    for (int offset = 1; offset < 32; offset <<= 1) {
        val += __shfl_xor_sync(0xffffffff, val, offset);
    }
    return val;
}
// In-place softmax over the last dimension of a [rows, cols] view.
// One warp (32 lanes) handles one row; the host launcher uses 128-thread
// blocks, so each block covers 4 rows. The per-lane register buffer
// holds 32 elements, so this assumes cols <= 32 * 32 = 1024.
template<typename T>
__global__ void attn_softmax_inplace_(
    T *input,
    long long rows, int cols
) {
    int threadidx_x = threadIdx.x / 32;  // warp index within the block
    int threadidx_y = threadIdx.x % 32;  // lane index within the warp
    long long row_offset = (long long)(blockIdx.x * 4 + threadidx_x);
    int cols_per_thread = (cols + 31) / 32;
    int cols_this_thread = cols_per_thread;

    // The lane holding the ragged tail gets fewer columns; lanes past
    // it process none.
    int last_y = (cols / cols_per_thread);
    if (threadidx_y == last_y) {
        cols_this_thread = cols - cols_per_thread * last_y;
    }
    else if (threadidx_y > last_y) {
        cols_this_thread = 0;
    }

    float buf[32];
    int lane_id = threadidx_y;

    if (row_offset < rows) {
        T *row_input = input + row_offset * cols;
        T *row_output = row_input;  // result is written back in place

        // Stage this lane's slice of the row in registers, as float
        #pragma unroll
        for (int i = 0; i < cols_this_thread; i++) {
            int idx = lane_id * cols_per_thread + i;
            buf[i] = static_cast<float>(row_input[idx]);
        }

        // Row max, for the numerically-stable softmax
        float thread_max = -1 * CUDART_INF_F;
        #pragma unroll
        for (int i = 0; i < cols_this_thread; i++) {
            thread_max = max(thread_max, buf[i]);
        }
        float warp_max = WarpAllReduceMax(thread_max);

        // exp(x - max) and the row sum
        float thread_sum = 0.f;
        #pragma unroll
        for (int i = 0; i < cols_this_thread; i++) {
            buf[i] = __expf(buf[i] - warp_max);
            thread_sum += buf[i];
        }
        float warp_sum = WarpAllReduceSum(thread_sum);

        // Normalize and store back in the input's dtype
        #pragma unroll
        for (int i = 0; i < cols_this_thread; i++) {
            row_output[lane_id * cols_per_thread + i] =
                static_cast<T>(__fdividef(buf[i], warp_sum));
        }
    }
}
// Host launcher for the in-place softmax kernel: 4 rows per 128-thread
// block. `input` is treated as a [rows, cols] matrix and overwritten.
void attn_softmax_inplace_forward_(
    at::Tensor input,
    long long rows, int cols
) {
    CHECK_INPUT(input);
    const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
    int grid = (rows + 3) / 4;
    dim3 block(128);

    // Only float32 and bfloat16 are supported; the Python wrapper
    // rejects other dtypes before reaching this point.
    if (input.dtype() == torch::kFloat32) {
        attn_softmax_inplace_<float><<<grid, block>>>(
            (float *)input.data_ptr(),
            rows, cols
        );
    }
    else {
        attn_softmax_inplace_<at::BFloat16><<<grid, block>>>(
            (at::BFloat16 *)input.data_ptr(),
            rows, cols
        );
    }
}
// Softmax backward, fused with computing dY = dO @ V^T on the fly.
// One warp per row of `output` (the saved softmax); `output` is
// overwritten with the gradient w.r.t. the pre-softmax logits:
//   dX_i = (dY_i - sum_j Y_j * dY_j) * Y_i
// Assumes cols_output <= 1024 (32-element per-lane buffers).
template<typename T>
__global__ void attn_softmax_inplace_grad_(
    T *output,
    T *d_ov,
    T *values,
    long long rows,
    int cols_output,
    int cols_values
) {
    int threadidx_x = threadIdx.x / 32;  // warp index within the block
    int threadidx_y = threadIdx.x % 32;  // lane index within the warp
    long long row_offset = (long long)(blockIdx.x * 4 + threadidx_x);
    int cols_per_thread = (cols_output + 31) / 32;
    int cols_this_thread = cols_per_thread;
    int rows_values = cols_output;
    // values are set to the beginning of the current
    // rows_values x cols_values leaf matrix
    long long value_row_offset = row_offset - row_offset % rows_values;

    // The lane holding the ragged tail gets fewer columns; lanes past
    // it process none.
    int last_y = (cols_output / cols_per_thread);
    if (threadidx_y == last_y) {
        cols_this_thread = cols_output - cols_per_thread * last_y;
    }
    else if (threadidx_y > last_y) {
        cols_this_thread = 0;
    }

    float y_buf[32];
    float dy_buf[32];
    int lane_id = threadidx_y;

    // (Removed an unused `float thread_max` local that was never read.)
    if (row_offset < rows) {
        T *row_output = output + row_offset * cols_output;
        T *row_d_ov = d_ov + row_offset * cols_values;
        T *row_values = values + value_row_offset * cols_values;

        // Compute a chunk of the output gradient on the fly
        int value_row_idx = 0;
        int value_idx = 0;
        #pragma unroll
        for (int i = 0; i < cols_this_thread; i++) {
            // NOTE(review): accumulating in T loses precision when T is
            // bfloat16; accumulating in float would be safer — confirm
            // before changing, as it alters numerics.
            T sum = 0.;
            #pragma unroll
            for (int j = 0; j < cols_values; j++) {
                value_row_idx = ((lane_id * cols_per_thread) + i);
                value_idx = value_row_idx * cols_values + j;
                sum += row_d_ov[j] * row_values[value_idx];
            }
            dy_buf[i] = static_cast<float>(sum);
        }

        // Stage this lane's slice of the saved softmax output
        #pragma unroll
        for (int i = 0; i < cols_this_thread; i++) {
            y_buf[i] = static_cast<float>(row_output[lane_id * cols_per_thread + i]);
        }

        // sum_j Y_j * dY_j, reduced across the warp
        float thread_sum = 0.;
        #pragma unroll
        for (int i = 0; i < cols_this_thread; i++) {
            thread_sum += y_buf[i] * dy_buf[i];
        }
        float warp_sum = WarpAllReduceSum(thread_sum);

        // Write the logit gradient back in place
        #pragma unroll
        for (int i = 0; i < cols_this_thread; i++) {
            row_output[lane_id * cols_per_thread + i] = static_cast<T>(
                (dy_buf[i] - warp_sum) * y_buf[i]
            );
        }
    }
}
// Host launcher for the fused softmax backward: 4 rows per 128-thread
// block. `output` (the saved softmax) is overwritten with the gradient
// w.r.t. the pre-softmax logits.
void attn_softmax_inplace_backward_(
    at::Tensor output,
    at::Tensor d_ov,
    at::Tensor values,
    long long rows,
    int cols_output,
    int cols_values
) {
    CHECK_INPUT(output);
    CHECK_INPUT(d_ov);
    CHECK_INPUT(values);
    const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
    int grid = (rows + 3) / 4;
    dim3 block(128);

    // Only float32 and bfloat16 are supported; enforced by the Python
    // wrapper's dtype check before launch.
    if (output.dtype() == torch::kFloat32) {
        attn_softmax_inplace_grad_<float><<<grid, block>>>(
            (float *)output.data_ptr(),
            (float *)d_ov.data_ptr(),
            (float *)values.data_ptr(),
            rows, cols_output, cols_values
        );
    } else {
        attn_softmax_inplace_grad_<at::BFloat16><<<grid, block>>>(
            (at::BFloat16 *)output.data_ptr(),
            (at::BFloat16 *)d_ov.data_ptr(),
            (at::BFloat16 *)values.data_ptr(),
            rows, cols_output, cols_values
        );
    }
}
// Copyright 2021 AlQuraishi Laboratory
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// modified from fastfold/model/fastnn/kernel/cuda_native/csrc/softmax_cuda.cpp
#include <torch/extension.h>
// CPU build stub: the real implementation lives in the CUDA translation
// unit. Kept so the extension links on CPU-only builds; any call fails fast.
void attn_softmax_inplace_forward_(
    at::Tensor input,
    long long rows, int cols
)
{
    throw std::runtime_error("attn_softmax_inplace_forward_ not implemented on CPU");
};
// CPU build stub matching the CUDA launcher's signature; see forward stub
// above. Raises unconditionally — this kernel has no CPU fallback.
void attn_softmax_inplace_backward_(
    at::Tensor output,
    at::Tensor d_ov,
    at::Tensor values,
    long long rows,
    int cols_output,
    int cols_values
)
{
    throw std::runtime_error("attn_softmax_inplace_backward_ not implemented on CPU");
};
\ No newline at end of file
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import operator
import time
import dllogger as logger
from dllogger import JSONStreamBackend, StdOutBackend, Verbosity
import numpy as np
from pytorch_lightning import Callback
import torch.cuda.profiler as profiler
def is_main_process():
    """Return True iff this process is local rank 0 (LOCAL_RANK unset or "0")."""
    local_rank = os.getenv("LOCAL_RANK", "0")
    return int(local_rank) == 0
class PerformanceLoggingCallback(Callback):
    """
    Lightning callback that records wall-clock timestamps for post-warmup
    steps and logs throughput and latency statistics (mean, 90/95/99th
    percentiles) via dllogger. Optionally starts/stops the CUDA profiler
    around the measured region.
    """

    def __init__(self, log_file, global_batch_size, warmup_steps: int = 0, profile: bool = False):
        logger.init(backends=[JSONStreamBackend(Verbosity.VERBOSE, log_file), StdOutBackend(Verbosity.VERBOSE)])
        self.warmup_steps = warmup_steps
        self.global_batch_size = global_batch_size
        self.step = 0  # batches seen so far (train and test combined)
        self.profile = profile
        self.timestamps = []  # wall-clock time at the start of each post-warmup step

    def do_step(self):
        """Count a step; start CUDA profiling when warmup ends and record
        a timestamp for every subsequent step."""
        self.step += 1
        if self.profile and self.step == self.warmup_steps:
            profiler.start()
        if self.step > self.warmup_steps:
            self.timestamps.append(time.time())

    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
        self.do_step()

    def on_test_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
        self.do_step()

    def process_performance_stats(self, deltas):
        """Convert an array of per-step durations (seconds) into a stats dict."""
        def _round3(val):
            return round(val, 3)

        # Samples per second using the global (all-device) batch size.
        throughput_imgps = _round3(self.global_batch_size / np.mean(deltas))
        timestamps_ms = 1000 * deltas
        stats = {
            "throughput": throughput_imgps,
            "latency_mean": _round3(timestamps_ms.mean()),
        }
        for level in [90, 95, 99]:
            stats[f"latency_{level}"] = _round3(np.percentile(timestamps_ms, level))
        return stats

    def _log(self):
        # Only rank 0 logs, so distributed runs don't emit duplicate records.
        if is_main_process():
            # Consecutive timestamp differences == per-step durations.
            deltas = np.diff(np.asarray(self.timestamps))
            stats = self.process_performance_stats(deltas)
            logger.log(step=(), data=stats)
            logger.flush()

    def on_train_end(self, trainer, pl_module):
        if self.profile:
            profiler.stop()
        self._log()

    def on_epoch_end(self, trainer, pl_module):
        self._log()
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
import logging
import ml_collections
import numpy as np
import torch
import torch.nn as nn
from torch.distributions.bernoulli import Bernoulli
from typing import Dict, Optional, Tuple
from openfold.np import residue_constants
from openfold.utils import feats
from openfold.utils.rigid_utils import Rotation, Rigid
from openfold.utils.tensor_utils import (
tree_map,
tensor_tree_map,
masked_mean,
permute_final_dims,
batched_gather,
)
def softmax_cross_entropy(logits, labels):
    """Softmax cross entropy between `logits` and (possibly soft) `labels`,
    reduced over the final dimension."""
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    return -torch.sum(labels * log_probs, dim=-1)
def sigmoid_cross_entropy(logits, labels):
    """Elementwise binary cross entropy computed directly on raw logits.

    Uses logsigmoid in double precision for numerical stability, then
    casts back to the input dtype.
    """
    orig_dtype = logits.dtype
    logits = logits.double()
    labels = labels.double()
    log_p = torch.nn.functional.logsigmoid(logits)
    # log(1 - sigmoid(x)) == logsigmoid(-x)
    log_not_p = torch.nn.functional.logsigmoid(-logits)
    loss = -labels * log_p - (1.0 - labels) * log_not_p
    return loss.to(dtype=orig_dtype)
def torsion_angle_loss(
    a,  # [*, N, 7, 2]
    a_gt,  # [*, N, 7, 2]
    a_alt_gt,  # [*, N, 7, 2]
):
    """Torsion-angle loss: squared chord distance on the unit circle to the
    closer of the ground truth and its alternative naming, plus a small
    penalty keeping the raw (sin, cos) predictions near unit norm."""
    # [*, N, 7] lengths of the raw (sin, cos) predictions
    pred_norm = torch.norm(a, dim=-1)
    # [*, N, 7, 2] predictions projected onto the unit circle
    a_unit = a / pred_norm.unsqueeze(-1)
    # [*, N, 7] distances to the ground truth and its symmetric alternative
    dist_gt = torch.norm(a_unit - a_gt, dim=-1)
    dist_alt_gt = torch.norm(a_unit - a_alt_gt, dim=-1)
    min_sq_dist = torch.minimum(dist_gt ** 2, dist_alt_gt ** 2)
    # [*] mean angle error and mean deviation of the norm from 1
    l_torsion = torch.mean(min_sq_dist, dim=(-1, -2))
    l_angle_norm = torch.mean(torch.abs(pred_norm - 1), dim=(-1, -2))
    an_weight = 0.02
    return l_torsion + an_weight * l_angle_norm
def compute_fape(
    pred_frames: Rigid,
    target_frames: Rigid,
    frames_mask: torch.Tensor,
    pred_positions: torch.Tensor,
    target_positions: torch.Tensor,
    positions_mask: torch.Tensor,
    length_scale: float,
    l1_clamp_distance: Optional[float] = None,
    eps=1e-8,
) -> torch.Tensor:
    """
    Computes FAPE loss.
    Args:
        pred_frames:
            [*, N_frames] Rigid object of predicted frames
        target_frames:
            [*, N_frames] Rigid object of ground truth frames
        frames_mask:
            [*, N_frames] binary mask for the frames
        pred_positions:
            [*, N_pts, 3] predicted atom positions
        target_positions:
            [*, N_pts, 3] ground truth positions
        positions_mask:
            [*, N_pts] positions mask
        length_scale:
            Length scale by which the loss is divided
        l1_clamp_distance:
            Cutoff above which distance errors are disregarded
        eps:
            Small value used to regularize denominators
    Returns:
        [*] loss tensor
    """
    # Express each point in the local coordinates of every (inverted) frame.
    # [*, N_frames, N_pts, 3]
    local_pred_pos = pred_frames.invert()[..., None].apply(
        pred_positions[..., None, :, :],
    )
    local_target_pos = target_frames.invert()[..., None].apply(
        target_positions[..., None, :, :],
    )
    # Per (frame, point) Euclidean error; eps keeps sqrt differentiable at 0.
    error_dist = torch.sqrt(
        torch.sum((local_pred_pos - local_target_pos) ** 2, dim=-1) + eps
    )
    # Optionally clamp so distant mistakes don't dominate the loss.
    if l1_clamp_distance is not None:
        error_dist = torch.clamp(error_dist, min=0, max=l1_clamp_distance)
    normed_error = error_dist / length_scale
    normed_error = normed_error * frames_mask[..., None]
    normed_error = normed_error * positions_mask[..., None, :]
    # FP16-friendly averaging. Roughly equivalent to:
    #
    # norm_factor = (
    #     torch.sum(frames_mask, dim=-1) *
    #     torch.sum(positions_mask, dim=-1)
    # )
    # normed_error = torch.sum(normed_error, dim=(-1, -2)) / (eps + norm_factor)
    #
    # ("roughly" because eps is necessarily duplicated in the latter)
    normed_error = torch.sum(normed_error, dim=-1)
    normed_error = (
        normed_error / (eps + torch.sum(frames_mask, dim=-1))[..., None]
    )
    normed_error = torch.sum(normed_error, dim=-1)
    normed_error = normed_error / (eps + torch.sum(positions_mask, dim=-1))
    return normed_error
def backbone_loss(
    backbone_rigid_tensor: torch.Tensor,
    backbone_rigid_mask: torch.Tensor,
    traj: torch.Tensor,
    use_clamped_fape: Optional[torch.Tensor] = None,
    clamp_distance: float = 10.0,
    loss_unit_distance: float = 10.0,
    eps: float = 1e-4,
    **kwargs,
) -> torch.Tensor:
    """
    Backbone FAPE loss over a trajectory of predicted frames.

    Args:
        backbone_rigid_tensor:
            [*, N, 4, 4] ground-truth backbone frames
        backbone_rigid_mask:
            [*, N] frame validity mask
        traj:
            Predicted frames in 7-component (quaternion + translation)
            format; presumably one entry per structure-module layer along
            the leading dim — confirm against the caller
        use_clamped_fape:
            Optional per-sample weight blending clamped and unclamped FAPE
        clamp_distance:
            Clamp cutoff forwarded to compute_fape
        loss_unit_distance:
            FAPE length scale
        eps:
            Numerical stabilizer
    Returns:
        Scalar loss averaged over all leading dimensions
    """
    pred_aff = Rigid.from_tensor_7(traj)
    # Rebuild with explicit rotation matrices (drop the quaternion form).
    pred_aff = Rigid(
        Rotation(rot_mats=pred_aff.get_rots().get_rot_mats(), quats=None),
        pred_aff.get_trans(),
    )
    # DISCREPANCY: DeepMind somehow gets a hold of a tensor_7 version of
    # backbone tensor, normalizes it, and then turns it back to a rotation
    # matrix. To avoid a potentially numerically unstable rotation matrix
    # to quaternion conversion, we just use the original rotation matrix
    # outright. This one hasn't been composed a bunch of times, though, so
    # it might be fine.
    gt_aff = Rigid.from_tensor_4x4(backbone_rigid_tensor)
    # The points scored are the frame translations themselves; [None]
    # broadcasts the ground truth across the trajectory dimension.
    fape_loss = compute_fape(
        pred_aff,
        gt_aff[None],
        backbone_rigid_mask[None],
        pred_aff.get_trans(),
        gt_aff[None].get_trans(),
        backbone_rigid_mask[None],
        l1_clamp_distance=clamp_distance,
        length_scale=loss_unit_distance,
        eps=eps,
    )
    if use_clamped_fape is not None:
        # Per-sample convex blend between the clamped and unclamped variants.
        unclamped_fape_loss = compute_fape(
            pred_aff,
            gt_aff[None],
            backbone_rigid_mask[None],
            pred_aff.get_trans(),
            gt_aff[None].get_trans(),
            backbone_rigid_mask[None],
            l1_clamp_distance=None,
            length_scale=loss_unit_distance,
            eps=eps,
        )
        fape_loss = fape_loss * use_clamped_fape + unclamped_fape_loss * (
            1 - use_clamped_fape
        )
    # Average over the batch dimension
    fape_loss = torch.mean(fape_loss)
    return fape_loss
def sidechain_loss(
    sidechain_frames: torch.Tensor,
    sidechain_atom_pos: torch.Tensor,
    rigidgroups_gt_frames: torch.Tensor,
    rigidgroups_alt_gt_frames: torch.Tensor,
    rigidgroups_gt_exists: torch.Tensor,
    renamed_atom14_gt_positions: torch.Tensor,
    renamed_atom14_gt_exists: torch.Tensor,
    alt_naming_is_better: torch.Tensor,
    clamp_distance: float = 10.0,
    length_scale: float = 10.0,
    eps: float = 1e-4,
    **kwargs,
) -> torch.Tensor:
    """
    Sidechain FAPE loss over all rigid groups, computed on the final entry
    of the structure-module trajectory (index [-1] below).

    Frames and positions are flattened so that every rigid group / atom of
    every residue is scored as a single long sequence per batch element.
    Returns the unreduced FAPE tensor from compute_fape.
    """
    # Per-residue blend: pick the alternative ground-truth frame naming
    # wherever it was determined to fit better.
    renamed_gt_frames = (
        1.0 - alt_naming_is_better[..., None, None, None]
    ) * rigidgroups_gt_frames + alt_naming_is_better[
        ..., None, None, None
    ] * rigidgroups_alt_gt_frames
    # Steamroll the inputs
    sidechain_frames = sidechain_frames[-1]
    batch_dims = sidechain_frames.shape[:-4]
    sidechain_frames = sidechain_frames.view(*batch_dims, -1, 4, 4)
    sidechain_frames = Rigid.from_tensor_4x4(sidechain_frames)
    renamed_gt_frames = renamed_gt_frames.view(*batch_dims, -1, 4, 4)
    renamed_gt_frames = Rigid.from_tensor_4x4(renamed_gt_frames)
    rigidgroups_gt_exists = rigidgroups_gt_exists.reshape(*batch_dims, -1)
    sidechain_atom_pos = sidechain_atom_pos[-1]
    sidechain_atom_pos = sidechain_atom_pos.view(*batch_dims, -1, 3)
    renamed_atom14_gt_positions = renamed_atom14_gt_positions.view(
        *batch_dims, -1, 3
    )
    renamed_atom14_gt_exists = renamed_atom14_gt_exists.view(*batch_dims, -1)
    fape = compute_fape(
        sidechain_frames,
        renamed_gt_frames,
        rigidgroups_gt_exists,
        sidechain_atom_pos,
        renamed_atom14_gt_positions,
        renamed_atom14_gt_exists,
        l1_clamp_distance=clamp_distance,
        length_scale=length_scale,
        eps=eps,
    )
    return fape
def fape_loss(
    out: Dict[str, torch.Tensor],
    batch: Dict[str, torch.Tensor],
    config: ml_collections.ConfigDict,
) -> torch.Tensor:
    """Weighted sum of backbone and sidechain FAPE, averaged over the batch.

    Config sub-sections (`config.backbone`, `config.sidechain`) are merged
    over the feature batch and forwarded as keyword arguments.
    """
    backbone_kwargs = {**batch, **config.backbone}
    bb_term = backbone_loss(
        traj=out["sm"]["frames"],
        **backbone_kwargs,
    )
    sidechain_kwargs = {**batch, **config.sidechain}
    sc_term = sidechain_loss(
        out["sm"]["sidechain_frames"],
        out["sm"]["positions"],
        **sidechain_kwargs,
    )
    total = config.backbone.weight * bb_term + config.sidechain.weight * sc_term
    # Average over the batch dimension
    return torch.mean(total)
def supervised_chi_loss(
    angles_sin_cos: torch.Tensor,
    unnormalized_angles_sin_cos: torch.Tensor,
    aatype: torch.Tensor,
    seq_mask: torch.Tensor,
    chi_mask: torch.Tensor,
    chi_angles_sin_cos: torch.Tensor,
    chi_weight: float,
    angle_norm_weight: float,
    eps=1e-6,
    **kwargs,
) -> torch.Tensor:
    """
    Implements Algorithm 27 (torsionAngleLoss)
    Args:
        angles_sin_cos:
            [*, N, 7, 2] predicted angles
        unnormalized_angles_sin_cos:
            The same angles, but unnormalized
        aatype:
            [*, N] residue indices
        seq_mask:
            [*, N] sequence mask
        chi_mask:
            [*, N, 7] angle mask
        chi_angles_sin_cos:
            [*, N, 7, 2] ground truth angles
        chi_weight:
            Weight for the angle component of the loss
        angle_norm_weight:
            Weight for the normalization component of the loss
    Returns:
        [*] loss tensor
    """
    # Only the last 4 of the 7 torsions are the chi angles.
    pred_angles = angles_sin_cos[..., 3:, :]
    residue_type_one_hot = torch.nn.functional.one_hot(
        aatype,
        residue_constants.restype_num + 1,
    )
    # Per-residue flags (via one-hot lookup) marking pi-periodic chis.
    chi_pi_periodic = torch.einsum(
        "...ij,jk->ik",
        residue_type_one_hot.type(angles_sin_cos.dtype),
        angles_sin_cos.new_tensor(residue_constants.chi_pi_periodic),
    )
    true_chi = chi_angles_sin_cos[None]
    # A pi shift of an angle negates both its sin and cos.
    shifted_mask = (1 - 2 * chi_pi_periodic).unsqueeze(-1)
    true_chi_shifted = shifted_mask * true_chi
    sq_chi_error = torch.sum((true_chi - pred_angles) ** 2, dim=-1)
    sq_chi_error_shifted = torch.sum(
        (true_chi_shifted - pred_angles) ** 2, dim=-1
    )
    # Score against whichever of the two equivalent targets is closer.
    sq_chi_error = torch.minimum(sq_chi_error, sq_chi_error_shifted)
    # The ol' switcheroo: move the leading dim (added by true_chi's [None];
    # presumably a layer/ensemble dim — confirm) in front of the last two,
    # so masked_mean below reduces over it together with residues and chis.
    sq_chi_error = sq_chi_error.permute(
        *range(len(sq_chi_error.shape))[1:-2], 0, -2, -1
    )
    sq_chi_loss = masked_mean(
        chi_mask[..., None, :, :], sq_chi_error, dim=(-1, -2, -3)
    )
    loss = chi_weight * sq_chi_loss
    # Penalize deviation of the unnormalized (sin, cos) norm from 1.
    angle_norm = torch.sqrt(
        torch.sum(unnormalized_angles_sin_cos ** 2, dim=-1) + eps
    )
    norm_error = torch.abs(angle_norm - 1.0)
    norm_error = norm_error.permute(
        *range(len(norm_error.shape))[1:-2], 0, -2, -1
    )
    angle_norm_loss = masked_mean(
        seq_mask[..., None, :, None], norm_error, dim=(-1, -2, -3)
    )
    loss = loss + angle_norm_weight * angle_norm_loss
    # Average over the batch dimension
    loss = torch.mean(loss)
    return loss
def compute_plddt(logits: torch.Tensor) -> torch.Tensor:
    """Convert per-residue lDDT-bin logits into pLDDT scores in [0, 100].

    The score is the expectation of the bin centers (midpoints of equal-width
    bins spanning [0, 1]) under the softmax distribution, times 100.
    """
    no_bins = logits.shape[-1]
    width = 1.0 / no_bins
    centers = torch.arange(
        start=0.5 * width, end=1.0, step=width, device=logits.device
    )
    probs = torch.nn.functional.softmax(logits, dim=-1)
    broadcast_shape = (1,) * len(probs.shape[:-1]) + centers.shape
    expected = torch.sum(probs * centers.view(broadcast_shape), dim=-1)
    return expected * 100
def lddt(
    all_atom_pred_pos: torch.Tensor,
    all_atom_positions: torch.Tensor,
    all_atom_mask: torch.Tensor,
    cutoff: float = 15.0,
    eps: float = 1e-10,
    per_residue: bool = True,
) -> torch.Tensor:
    """
    Superposition-free lDDT score between predicted and true positions.

    Pairs whose *true* distance is below `cutoff` (excluding self-pairs and
    masked atoms) are scored by how many of the thresholds 0.5/1/2/4 their
    distance error stays under (each worth 0.25).

    Args:
        all_atom_pred_pos: [*, N, 3] predicted positions
        all_atom_positions: [*, N, 3] ground-truth positions
        all_atom_mask: [*, N, 1] atom mask (trailing singleton dim)
        cutoff: inclusion radius measured on true distances
        eps: regularizer for sqrt and the denominator
        per_residue: if True, average per residue; else over all pairs
    Returns:
        [*, N] (per_residue) or [*] lDDT scores in [0, 1]
    """
    n = all_atom_mask.shape[-2]
    # True and predicted pairwise distance matrices, [*, N, N].
    dmat_true = torch.sqrt(
        eps
        + torch.sum(
            (
                all_atom_positions[..., None, :]
                - all_atom_positions[..., None, :, :]
            )
            ** 2,
            dim=-1,
        )
    )
    dmat_pred = torch.sqrt(
        eps
        + torch.sum(
            (
                all_atom_pred_pos[..., None, :]
                - all_atom_pred_pos[..., None, :, :]
            )
            ** 2,
            dim=-1,
        )
    )
    # Pairs eligible for scoring: within cutoff (by the true structure),
    # both atoms present, and off the diagonal.
    dists_to_score = (
        (dmat_true < cutoff)
        * all_atom_mask
        * permute_final_dims(all_atom_mask, (1, 0))
        * (1.0 - torch.eye(n, device=all_atom_mask.device))
    )
    dist_l1 = torch.abs(dmat_true - dmat_pred)
    # Fraction of the four tolerance thresholds the error stays under.
    score = (
        (dist_l1 < 0.5).type(dist_l1.dtype)
        + (dist_l1 < 1.0).type(dist_l1.dtype)
        + (dist_l1 < 2.0).type(dist_l1.dtype)
        + (dist_l1 < 4.0).type(dist_l1.dtype)
    )
    score = score * 0.25
    dims = (-1,) if per_residue else (-2, -1)
    norm = 1.0 / (eps + torch.sum(dists_to_score, dim=dims))
    score = norm * (eps + torch.sum(dists_to_score * score, dim=dims))
    return score
def lddt_ca(
    all_atom_pred_pos: torch.Tensor,
    all_atom_positions: torch.Tensor,
    all_atom_mask: torch.Tensor,
    cutoff: float = 15.0,
    eps: float = 1e-10,
    per_residue: bool = True,
) -> torch.Tensor:
    """lDDT restricted to the C-alpha atoms of each residue."""
    ca_idx = residue_constants.atom_order["CA"]
    # Select the CA channel; slicing keeps the mask's trailing dim.
    ca_pred = all_atom_pred_pos[..., ca_idx, :]
    ca_true = all_atom_positions[..., ca_idx, :]
    ca_mask = all_atom_mask[..., ca_idx : (ca_idx + 1)]
    return lddt(
        ca_pred,
        ca_true,
        ca_mask,
        cutoff=cutoff,
        eps=eps,
        per_residue=per_residue,
    )
def lddt_loss(
    logits: torch.Tensor,
    all_atom_pred_pos: torch.Tensor,
    all_atom_positions: torch.Tensor,
    all_atom_mask: torch.Tensor,
    resolution: torch.Tensor,
    cutoff: float = 15.0,
    no_bins: int = 50,
    min_resolution: float = 0.1,
    max_resolution: float = 3.0,
    eps: float = 1e-10,
    **kwargs,
) -> torch.Tensor:
    """
    Confidence (pLDDT) head loss.

    Computes the true per-residue C-alpha lDDT (detached — it is a label,
    not a gradient path), discretizes it into `no_bins` bins, and scores
    the head's logits with softmax cross entropy, masked to residues whose
    CA exists and zeroed for structures outside the resolution window.

    Args:
        logits: [*, N, no_bins] predicted lDDT-bin logits
        all_atom_pred_pos: predicted positions, indexed by atom channel
            (assumed atom37/atom14 layout matching residue_constants —
            the CA channel is sliced out below)
        all_atom_positions: ground-truth positions, same layout
        all_atom_mask: atom existence mask, same layout
        resolution: [*] per-sample experimental resolution
        cutoff: lDDT inclusion radius
        no_bins: number of discretization bins
        min_resolution/max_resolution: trusted resolution window
        eps: denominator regularizer
    Returns:
        Scalar loss averaged over the batch
    """
    ca_pos = residue_constants.atom_order["CA"]
    all_atom_pred_pos = all_atom_pred_pos[..., ca_pos, :]
    all_atom_positions = all_atom_positions[..., ca_pos, :]
    all_atom_mask = all_atom_mask[..., ca_pos : (ca_pos + 1)]  # keep dim
    score = lddt(
        all_atom_pred_pos,
        all_atom_positions,
        all_atom_mask,
        cutoff=cutoff,
        eps=eps
    )
    # The target lDDT is a fixed label; don't backprop through it.
    score = score.detach()
    # Discretize: score in [0, 1] -> bin index in [0, no_bins - 1].
    bin_index = torch.floor(score * no_bins).long()
    bin_index = torch.clamp(bin_index, max=(no_bins - 1))
    lddt_ca_one_hot = torch.nn.functional.one_hot(
        bin_index, num_classes=no_bins
    )
    errors = softmax_cross_entropy(logits, lddt_ca_one_hot)
    all_atom_mask = all_atom_mask.squeeze(-1)
    loss = torch.sum(errors * all_atom_mask, dim=-1) / (
        eps + torch.sum(all_atom_mask, dim=-1)
    )
    # Zero out samples outside the trusted resolution range.
    loss = loss * (
        (resolution >= min_resolution) & (resolution <= max_resolution)
    )
    # Average over the batch dimension
    loss = torch.mean(loss)
    return loss
def distogram_loss(
    logits,
    pseudo_beta,
    pseudo_beta_mask,
    min_bin=2.3125,
    max_bin=21.6875,
    no_bins=64,
    eps=1e-6,
    **kwargs,
):
    """
    Distogram head loss: softmax cross entropy between the predicted bin
    logits and the binned true pairwise pseudo-beta distances, averaged
    over valid residue pairs and then over the batch.
    """
    # Bin boundaries are defined on distances but compared on squared
    # distances, so square them once up front.
    squared_bounds = torch.linspace(
        min_bin,
        max_bin,
        no_bins - 1,
        device=logits.device,
    ) ** 2
    # [*, N, N, 1] squared pairwise pseudo-beta distances
    sq_dists = torch.sum(
        (pseudo_beta[..., None, :] - pseudo_beta[..., None, :, :]) ** 2,
        dim=-1,
        keepdims=True,
    )
    # Count how many boundaries each pair exceeds -> its bin index.
    true_bins = torch.sum(sq_dists > squared_bounds, dim=-1)
    errors = softmax_cross_entropy(
        logits,
        torch.nn.functional.one_hot(true_bins, no_bins),
    )
    # A pair is valid only if both residues have a pseudo-beta atom.
    square_mask = pseudo_beta_mask[..., None] * pseudo_beta_mask[..., None, :]
    # FP16-friendly sum. Equivalent to:
    # mean = (torch.sum(errors * square_mask, dim=(-1, -2)) /
    #         (eps + torch.sum(square_mask, dim=(-1, -2))))
    denom = eps + torch.sum(square_mask, dim=(-1, -2))
    masked_errors = errors * square_mask
    mean = torch.sum(masked_errors, dim=-1) / denom[..., None]
    mean = torch.sum(mean, dim=-1)
    # Average over the batch dimensions
    return torch.mean(mean)
def _calculate_bin_centers(boundaries: torch.Tensor):
step = boundaries[1] - boundaries[0]
bin_centers = boundaries + step / 2
bin_centers = torch.cat(
[bin_centers, (bin_centers[-1] + step).unsqueeze(-1)], dim=0
)
return bin_centers
def _calculate_expected_aligned_error(
    alignment_confidence_breaks: torch.Tensor,
    aligned_distance_error_probs: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Expectation of the bin centers under the error distribution, along
    with the largest representable error (the final bin center)."""
    centers = _calculate_bin_centers(alignment_confidence_breaks)
    expected_error = torch.sum(aligned_distance_error_probs * centers, dim=-1)
    return expected_error, centers[-1]
def compute_predicted_aligned_error(
    logits: torch.Tensor,
    max_bin: int = 31,
    no_bins: int = 64,
    **kwargs,
) -> Dict[str, torch.Tensor]:
    """Computes aligned confidence metrics from logits.
    Args:
        logits: [*, num_res, num_res, num_bins] the logits output from
            PredictedAlignedErrorHead.
        max_bin: Maximum bin value
        no_bins: Number of bins
    Returns:
        Dict with:
            aligned_confidence_probs: [*, num_res, num_res, num_bins]
                softmax of the logits over bins.
            predicted_aligned_error: [*, num_res, num_res] expected aligned
                distance error per residue pair.
            max_predicted_aligned_error: [*] largest predictable error.
    """
    breaks = torch.linspace(
        0, max_bin, steps=(no_bins - 1), device=logits.device
    )
    probs = torch.nn.functional.softmax(logits, dim=-1)
    expected_error, max_error = _calculate_expected_aligned_error(
        alignment_confidence_breaks=breaks,
        aligned_distance_error_probs=probs,
    )
    return {
        "aligned_confidence_probs": probs,
        "predicted_aligned_error": expected_error,
        "max_predicted_aligned_error": max_error,
    }
def compute_tm(
    logits: torch.Tensor,
    residue_weights: Optional[torch.Tensor] = None,
    max_bin: int = 31,
    no_bins: int = 64,
    eps: float = 1e-8,
    **kwargs,
) -> torch.Tensor:
    """Predicted TM-score (pTM) from aligned-error bin logits.

    Evaluates the TM kernel at every bin center, takes its expectation under
    the predicted error distribution, and returns the score of the
    highest-weighted alignment.
    """
    if residue_weights is None:
        residue_weights = logits.new_ones(logits.shape[-2])
    breaks = torch.linspace(
        0, max_bin, steps=(no_bins - 1), device=logits.device
    )
    bin_centers = _calculate_bin_centers(breaks)
    # d0 from the TM-score definition; N is clipped to at least 19.
    clipped_n = max(torch.sum(residue_weights), 19)
    d0 = 1.24 * (clipped_n - 15) ** (1.0 / 3) - 1.8
    bin_probs = torch.nn.functional.softmax(logits, dim=-1)
    # TM kernel per bin, then its expectation over bins.
    tm_per_bin = 1.0 / (1 + (bin_centers ** 2) / (d0 ** 2))
    expected_tm_term = torch.sum(bin_probs * tm_per_bin, dim=-1)
    normed_residue_mask = residue_weights / (eps + residue_weights.sum())
    per_alignment = torch.sum(expected_tm_term * normed_residue_mask, dim=-1)
    # Pick the alignment with the largest weighted score.
    weighted = per_alignment * residue_weights
    best = (weighted == torch.max(weighted)).nonzero()[0]
    return per_alignment[tuple(best)]
def tm_loss(
    logits,
    final_affine_tensor,
    backbone_rigid_tensor,
    backbone_rigid_mask,
    resolution,
    max_bin=31,
    no_bins=64,
    min_resolution: float = 0.1,
    max_resolution: float = 3.0,
    eps=1e-8,
    **kwargs,
):
    """
    Loss for the predicted-aligned-error (pTM) head.

    Softmax cross entropy between `logits` and the binned frame-aligned
    squared distances between predicted and ground-truth backbone frames,
    averaged over valid frame pairs and gated by resolution.
    """
    pred_affine = Rigid.from_tensor_7(final_affine_tensor)
    backbone_rigid = Rigid.from_tensor_4x4(backbone_rigid_tensor)
    def _points(affine):
        # Frame translations, expressed in the local coords of every frame.
        pts = affine.get_trans()[..., None, :, :]
        return affine.invert()[..., None].apply(pts)
    sq_diff = torch.sum(
        (_points(pred_affine) - _points(backbone_rigid)) ** 2, dim=-1
    )
    # The binned targets are labels, not a gradient path.
    sq_diff = sq_diff.detach()
    # Boundaries are defined on distances; square them to compare against
    # the squared differences above.
    boundaries = torch.linspace(
        0, max_bin, steps=(no_bins - 1), device=logits.device
    )
    boundaries = boundaries ** 2
    true_bins = torch.sum(sq_diff[..., None] > boundaries, dim=-1)
    errors = softmax_cross_entropy(
        logits, torch.nn.functional.one_hot(true_bins, no_bins)
    )
    square_mask = (
        backbone_rigid_mask[..., None] * backbone_rigid_mask[..., None, :]
    )
    loss = torch.sum(errors * square_mask, dim=-1)
    scale = 0.5  # hack to help FP16 training along
    denom = eps + torch.sum(scale * square_mask, dim=(-1, -2))
    loss = loss / denom[..., None]
    loss = torch.sum(loss, dim=-1)
    # Undo the extra `scale` baked into the denominator above.
    loss = loss * scale
    # Zero out samples outside the trusted resolution range.
    loss = loss * (
        (resolution >= min_resolution) & (resolution <= max_resolution)
    )
    # Average over the loss dimension
    loss = torch.mean(loss)
    return loss
def between_residue_bond_loss(
    pred_atom_positions: torch.Tensor,  # (*, N, 37/14, 3)
    pred_atom_mask: torch.Tensor,  # (*, N, 37/14)
    residue_index: torch.Tensor,  # (*, N)
    aatype: torch.Tensor,  # (*, N)
    tolerance_factor_soft=12.0,
    tolerance_factor_hard=12.0,
    eps=1e-6,
) -> Dict[str, torch.Tensor]:
    """Flat-bottom loss to penalize structural violations between residues.
    This is a loss penalizing any violation of the geometry around the peptide
    bond between consecutive amino acids. This loss corresponds to
    Jumper et al. (2021) Suppl. Sec. 1.9.11, eq 44, 45.
    Args:
        pred_atom_positions: Atom positions in atom37/14 representation
        pred_atom_mask: Atom mask in atom37/14 representation
        residue_index: Residue index for given amino acid, this is assumed to be
            monotonically increasing.
        aatype: Amino acid type of given residue
        tolerance_factor_soft: soft tolerance factor measured in standard deviations
            of pdb distributions
        tolerance_factor_hard: hard tolerance factor measured in standard deviations
            of pdb distributions
    Returns:
        Dict containing:
            * 'c_n_loss_mean': Loss for peptide bond length violations
            * 'ca_c_n_loss_mean': Loss for violations of bond angle around C spanned
                by CA, C, N
            * 'c_n_ca_loss_mean': Loss for violations of bond angle around N spanned
                by C, N, CA
            * 'per_residue_loss_sum': sum of all losses for each residue
            * 'per_residue_violation_mask': mask denoting all residues with violation
                present.
    """
    # Get the positions of the relevant backbone atoms.
    # Atom channels 0/1/2 are used as N/CA/C (implied by the variable names;
    # layout assumed to match the atom37/atom14 convention).
    this_ca_pos = pred_atom_positions[..., :-1, 1, :]
    this_ca_mask = pred_atom_mask[..., :-1, 1]
    this_c_pos = pred_atom_positions[..., :-1, 2, :]
    this_c_mask = pred_atom_mask[..., :-1, 2]
    next_n_pos = pred_atom_positions[..., 1:, 0, :]
    next_n_mask = pred_atom_mask[..., 1:, 0]
    next_ca_pos = pred_atom_positions[..., 1:, 1, :]
    next_ca_mask = pred_atom_mask[..., 1:, 1]
    # Only consecutive residue indices are chemically bonded neighbors;
    # chain breaks are excluded from every term below.
    has_no_gap_mask = (residue_index[..., 1:] - residue_index[..., :-1]) == 1.0
    # Compute loss for the C--N bond.
    c_n_bond_length = torch.sqrt(
        eps + torch.sum((this_c_pos - next_n_pos) ** 2, dim=-1)
    )
    # The C-N bond to proline has slightly different length because of the ring.
    next_is_proline = aatype[..., 1:] == residue_constants.resname_to_idx["PRO"]
    gt_length = (
        ~next_is_proline
    ) * residue_constants.between_res_bond_length_c_n[
        0
    ] + next_is_proline * residue_constants.between_res_bond_length_c_n[
        1
    ]
    gt_stddev = (
        ~next_is_proline
    ) * residue_constants.between_res_bond_length_stddev_c_n[
        0
    ] + next_is_proline * residue_constants.between_res_bond_length_stddev_c_n[
        1
    ]
    c_n_bond_length_error = torch.sqrt(eps + (c_n_bond_length - gt_length) ** 2)
    # Flat-bottom: no penalty within `tolerance_factor_soft` stddevs.
    c_n_loss_per_residue = torch.nn.functional.relu(
        c_n_bond_length_error - tolerance_factor_soft * gt_stddev
    )
    mask = this_c_mask * next_n_mask * has_no_gap_mask
    c_n_loss = torch.sum(mask * c_n_loss_per_residue, dim=-1) / (
        torch.sum(mask, dim=-1) + eps
    )
    c_n_violation_mask = mask * (
        c_n_bond_length_error > (tolerance_factor_hard * gt_stddev)
    )
    # Compute loss for the angles.
    ca_c_bond_length = torch.sqrt(
        eps + torch.sum((this_ca_pos - this_c_pos) ** 2, dim=-1)
    )
    n_ca_bond_length = torch.sqrt(
        eps + torch.sum((next_n_pos - next_ca_pos) ** 2, dim=-1)
    )
    c_ca_unit_vec = (this_ca_pos - this_c_pos) / ca_c_bond_length[..., None]
    c_n_unit_vec = (next_n_pos - this_c_pos) / c_n_bond_length[..., None]
    n_ca_unit_vec = (next_ca_pos - next_n_pos) / n_ca_bond_length[..., None]
    # Angles are compared via their cosines (dot products of unit vectors).
    ca_c_n_cos_angle = torch.sum(c_ca_unit_vec * c_n_unit_vec, dim=-1)
    gt_angle = residue_constants.between_res_cos_angles_ca_c_n[0]
    # NOTE(review): this stddev is read from the C-N *bond-length* table,
    # while the analogous c_n_ca step below reads its stddev from the
    # cosine-angle table (index [1]). Looks inconsistent — confirm against
    # the reference implementation before changing.
    gt_stddev = residue_constants.between_res_bond_length_stddev_c_n[0]
    ca_c_n_cos_angle_error = torch.sqrt(
        eps + (ca_c_n_cos_angle - gt_angle) ** 2
    )
    ca_c_n_loss_per_residue = torch.nn.functional.relu(
        ca_c_n_cos_angle_error - tolerance_factor_soft * gt_stddev
    )
    mask = this_ca_mask * this_c_mask * next_n_mask * has_no_gap_mask
    ca_c_n_loss = torch.sum(mask * ca_c_n_loss_per_residue, dim=-1) / (
        torch.sum(mask, dim=-1) + eps
    )
    ca_c_n_violation_mask = mask * (
        ca_c_n_cos_angle_error > (tolerance_factor_hard * gt_stddev)
    )
    c_n_ca_cos_angle = torch.sum((-c_n_unit_vec) * n_ca_unit_vec, dim=-1)
    gt_angle = residue_constants.between_res_cos_angles_c_n_ca[0]
    gt_stddev = residue_constants.between_res_cos_angles_c_n_ca[1]
    c_n_ca_cos_angle_error = torch.sqrt(
        eps + torch.square(c_n_ca_cos_angle - gt_angle)
    )
    c_n_ca_loss_per_residue = torch.nn.functional.relu(
        c_n_ca_cos_angle_error - tolerance_factor_soft * gt_stddev
    )
    mask = this_c_mask * next_n_mask * next_ca_mask * has_no_gap_mask
    c_n_ca_loss = torch.sum(mask * c_n_ca_loss_per_residue, dim=-1) / (
        torch.sum(mask, dim=-1) + eps
    )
    c_n_ca_violation_mask = mask * (
        c_n_ca_cos_angle_error > (tolerance_factor_hard * gt_stddev)
    )
    # Compute a per residue loss (equally distribute the loss to both
    # neighbouring residues).
    per_residue_loss_sum = (
        c_n_loss_per_residue + ca_c_n_loss_per_residue + c_n_ca_loss_per_residue
    )
    # Each bond term involves residues i and i+1; padding on opposite sides
    # assigns half of it to each.
    per_residue_loss_sum = 0.5 * (
        torch.nn.functional.pad(per_residue_loss_sum, (0, 1))
        + torch.nn.functional.pad(per_residue_loss_sum, (1, 0))
    )
    # Compute hard violations.
    violation_mask = torch.max(
        torch.stack(
            [c_n_violation_mask, ca_c_n_violation_mask, c_n_ca_violation_mask],
            dim=-2,
        ),
        dim=-2,
    )[0]
    # A residue is flagged if either of its flanking bonds violates.
    violation_mask = torch.maximum(
        torch.nn.functional.pad(violation_mask, (0, 1)),
        torch.nn.functional.pad(violation_mask, (1, 0)),
    )
    return {
        "c_n_loss_mean": c_n_loss,
        "ca_c_n_loss_mean": ca_c_n_loss,
        "c_n_ca_loss_mean": c_n_ca_loss,
        "per_residue_loss_sum": per_residue_loss_sum,
        "per_residue_violation_mask": violation_mask,
    }
def between_residue_clash_loss(
    atom14_pred_positions: torch.Tensor,
    atom14_atom_exists: torch.Tensor,
    atom14_atom_radius: torch.Tensor,
    residue_index: torch.Tensor,
    overlap_tolerance_soft=1.5,
    overlap_tolerance_hard=1.5,
    eps=1e-10,
) -> Dict[str, torch.Tensor]:
    """Loss to penalize steric clashes between residues.
    This is a loss penalizing any steric clashes due to non bonded atoms in
    different peptides coming too close. This loss corresponds to the part with
    different residues of
    Jumper et al. (2021) Suppl. Sec. 1.9.11, eq 46.
    Args:
        atom14_pred_positions: Predicted positions of atoms in
            global prediction frame
        atom14_atom_exists: Mask denoting whether atom at positions exists for given
            amino acid type
        atom14_atom_radius: Van der Waals radius for each atom.
        residue_index: Residue index for given amino acid.
        overlap_tolerance_soft: Soft tolerance factor.
        overlap_tolerance_hard: Hard tolerance factor.
    Returns:
        Dict containing:
            * 'mean_loss': average clash loss
            * 'per_atom_loss_sum': sum of all clash losses per atom, shape (N, 14)
            * 'per_atom_clash_mask': mask whether atom clashes with any other atom
                shape (N, 14)
    """
    fp_type = atom14_pred_positions.dtype
    # Create the distance matrix.
    # (N, N, 14, 14)
    dists = torch.sqrt(
        eps
        + torch.sum(
            (
                atom14_pred_positions[..., :, None, :, None, :]
                - atom14_pred_positions[..., None, :, None, :, :]
            )
            ** 2,
            dim=-1,
        )
    )
    # Create the mask for valid distances.
    # shape (N, N, 14, 14)
    dists_mask = (
        atom14_atom_exists[..., :, None, :, None]
        * atom14_atom_exists[..., None, :, None, :]
    ).type(fp_type)
    # Mask out all the duplicate entries in the lower triangular matrix.
    # Also mask out the diagonal (atom-pairs from the same residue) -- these atoms
    # are handled separately.
    dists_mask = dists_mask * (
        residue_index[..., :, None, None, None]
        < residue_index[..., None, :, None, None]
    )
    # Backbone C--N bond between subsequent residues is no clash.
    # Atom14 channel 2 is used as C and channel 0 as N (implied by the
    # variable names below).
    c_one_hot = torch.nn.functional.one_hot(
        residue_index.new_tensor(2), num_classes=14
    )
    c_one_hot = c_one_hot.reshape(
        *((1,) * len(residue_index.shape[:-1])), *c_one_hot.shape
    )
    c_one_hot = c_one_hot.type(fp_type)
    n_one_hot = torch.nn.functional.one_hot(
        residue_index.new_tensor(0), num_classes=14
    )
    n_one_hot = n_one_hot.reshape(
        *((1,) * len(residue_index.shape[:-1])), *n_one_hot.shape
    )
    n_one_hot = n_one_hot.type(fp_type)
    neighbour_mask = (
        residue_index[..., :, None, None, None] + 1
    ) == residue_index[..., None, :, None, None]
    c_n_bonds = (
        neighbour_mask
        * c_one_hot[..., None, None, :, None]
        * n_one_hot[..., None, None, None, :]
    )
    dists_mask = dists_mask * (1.0 - c_n_bonds)
    # Disulfide bridge between two cysteines is no clash.
    cys = residue_constants.restype_name_to_atom14_names["CYS"]
    cys_sg_idx = cys.index("SG")
    cys_sg_idx = residue_index.new_tensor(cys_sg_idx)
    cys_sg_idx = cys_sg_idx.reshape(
        *((1,) * len(residue_index.shape[:-1])), 1
    ).squeeze(-1)
    cys_sg_one_hot = torch.nn.functional.one_hot(cys_sg_idx, num_classes=14)
    disulfide_bonds = (
        cys_sg_one_hot[..., None, None, :, None]
        * cys_sg_one_hot[..., None, None, None, :]
    )
    dists_mask = dists_mask * (1.0 - disulfide_bonds)
    # Compute the lower bound for the allowed distances.
    # shape (N, N, 14, 14)
    dists_lower_bound = dists_mask * (
        atom14_atom_radius[..., :, None, :, None]
        + atom14_atom_radius[..., None, :, None, :]
    )
    # Compute the error.
    # shape (N, N, 14, 14)
    dists_to_low_error = dists_mask * torch.nn.functional.relu(
        dists_lower_bound - overlap_tolerance_soft - dists
    )
    # Compute the mean loss.
    # shape ()
    mean_loss = torch.sum(dists_to_low_error) / (1e-6 + torch.sum(dists_mask))
    # Compute the per atom loss sum.
    # shape (N, 14)
    # Both reductions are needed because the mask above keeps only the upper
    # triangle; `axis=` and `dim=` are interchangeable in torch.
    per_atom_loss_sum = torch.sum(dists_to_low_error, dim=(-4, -2)) + torch.sum(
        dists_to_low_error, axis=(-3, -1)
    )
    # Compute the hard clash mask.
    # shape (N, N, 14, 14)
    clash_mask = dists_mask * (
        dists < (dists_lower_bound - overlap_tolerance_hard)
    )
    # Compute the per atom clash.
    # shape (N, 14)
    per_atom_clash_mask = torch.maximum(
        torch.amax(clash_mask, axis=(-4, -2)),
        torch.amax(clash_mask, axis=(-3, -1)),
    )
    return {
        "mean_loss": mean_loss,  # shape ()
        "per_atom_loss_sum": per_atom_loss_sum,  # shape (N, 14)
        "per_atom_clash_mask": per_atom_clash_mask,  # shape (N, 14)
    }
def within_residue_violations(
    atom14_pred_positions: torch.Tensor,
    atom14_atom_exists: torch.Tensor,
    atom14_dists_lower_bound: torch.Tensor,
    atom14_dists_upper_bound: torch.Tensor,
    tighten_bounds_for_loss=0.0,
    eps=1e-10,
) -> Dict[str, torch.Tensor]:
    """Loss to penalize steric clashes within residues.

    Penalizes distance-bound violations between non-bonded atom pairs of the
    same residue (Jumper et al. (2021) Suppl. Sec. 1.9.11, eq 46, same-residue
    part).

    Args:
        atom14_pred_positions ([*, N, 14, 3]):
            Predicted atom positions in the global frame.
        atom14_atom_exists ([*, N, 14]):
            Mask denoting whether each atom exists for the amino acid type.
        atom14_dists_lower_bound:
            Lower distance bounds; broadcastable against the [*, N, 14, 14]
            intra-residue distance matrix.
        atom14_dists_upper_bound:
            Upper distance bounds, same broadcasting as the lower bounds.
        tighten_bounds_for_loss:
            Extra margin by which the bounds are tightened for the loss.
        eps:
            Small constant stabilizing the sqrt gradient.
    Returns:
        Dict containing:
            * 'per_atom_loss_sum' ([*, N, 14]): summed clash loss per atom.
            * 'per_atom_violations' ([*, N, 14]): per-atom hard-violation mask.
    """
    # Pair mask: both atoms must exist, and an atom is never compared with
    # itself (off-diagonal of a 14x14 identity).
    off_diagonal = 1.0 - torch.eye(14, device=atom14_atom_exists.device)
    pair_mask = off_diagonal.reshape(
        (1,) * (atom14_atom_exists.dim() - 1) + (14, 14)
    )
    pair_mask = (
        atom14_atom_exists.unsqueeze(-1)
        * atom14_atom_exists.unsqueeze(-2)
        * pair_mask
    )
    # [*, N, 14, 14] intra-residue pairwise distances.
    deltas = (
        atom14_pred_positions.unsqueeze(-2)
        - atom14_pred_positions.unsqueeze(-3)
    )
    dists = torch.sqrt(eps + torch.sum(deltas ** 2, dim=-1))
    # Soft penalties for falling below / exceeding the (tightened) bounds.
    low_error = torch.clamp(
        atom14_dists_lower_bound + tighten_bounds_for_loss - dists, min=0.0
    )
    high_error = torch.clamp(
        dists - (atom14_dists_upper_bound - tighten_bounds_for_loss), min=0.0
    )
    loss = pair_mask * (low_error + high_error)
    # Fold the pair losses back onto each participating atom.
    per_atom_loss_sum = torch.sum(loss, dim=-2) + torch.sum(loss, dim=-1)
    # Hard violations against the untightened bounds.
    out_of_bounds = (dists < atom14_dists_lower_bound) | (
        dists > atom14_dists_upper_bound
    )
    violations = pair_mask * out_of_bounds
    per_atom_violations = torch.maximum(
        torch.max(violations, dim=-2)[0], torch.max(violations, dim=-1)[0]
    )
    return {
        "per_atom_loss_sum": per_atom_loss_sum,
        "per_atom_violations": per_atom_violations,
    }
def find_structural_violations(
    batch: Dict[str, torch.Tensor],
    atom14_pred_positions: torch.Tensor,
    violation_tolerance_factor: float,
    clash_overlap_tolerance: float,
    **kwargs,
) -> Dict[str, torch.Tensor]:
    """Computes several checks for structural violations.

    Args:
        batch: Feature dict; reads "atom14_atom_exists", "residue_index",
            "aatype" and "residx_atom14_to_atom37".
        atom14_pred_positions: [*, N, 14, 3] predicted atom positions.
        violation_tolerance_factor: Number of standard deviations by which a
            bond length/angle may deviate before being flagged.
        clash_overlap_tolerance: Allowed Van der Waals overlap (Angstrom)
            before two non-bonded atoms count as clashing.
    Returns:
        Nested dict with "between_residues", "within_residues" and
        "total_per_residue_violations_mask" entries (exact keys and shapes
        annotated on the return statement below).
    """
    # Compute between residue backbone violations of bonds and angles.
    connection_violations = between_residue_bond_loss(
        pred_atom_positions=atom14_pred_positions,
        pred_atom_mask=batch["atom14_atom_exists"],
        residue_index=batch["residue_index"],
        aatype=batch["aatype"],
        tolerance_factor_soft=violation_tolerance_factor,
        tolerance_factor_hard=violation_tolerance_factor,
    )
    # Compute the Van der Waals radius for every atom
    # (the first letter of the atom name is the element type).
    # Shape: (N, 14).
    atomtype_radius = [
        residue_constants.van_der_waals_radius[name[0]]
        for name in residue_constants.atom_types
    ]
    atomtype_radius = atom14_pred_positions.new_tensor(atomtype_radius)
    # Map the per-atom37-type radii onto the atom14 layout; zero for atoms
    # that do not exist for the residue type.
    atom14_atom_radius = (
        batch["atom14_atom_exists"]
        * atomtype_radius[batch["residx_atom14_to_atom37"]]
    )
    # Compute the between residue clash loss.
    between_residue_clashes = between_residue_clash_loss(
        atom14_pred_positions=atom14_pred_positions,
        atom14_atom_exists=batch["atom14_atom_exists"],
        atom14_atom_radius=atom14_atom_radius,
        residue_index=batch["residue_index"],
        overlap_tolerance_soft=clash_overlap_tolerance,
        overlap_tolerance_hard=clash_overlap_tolerance,
    )
    # Compute all within-residue violations (clashes,
    # bond length and angle violations).
    restype_atom14_bounds = residue_constants.make_atom14_dists_bounds(
        overlap_tolerance=clash_overlap_tolerance,
        bond_length_tolerance_factor=violation_tolerance_factor,
    )
    atom14_atom_exists = batch["atom14_atom_exists"]
    # Gather the per-residue-type bound tables by aatype.
    atom14_dists_lower_bound = atom14_pred_positions.new_tensor(
        restype_atom14_bounds["lower_bound"]
    )[batch["aatype"]]
    atom14_dists_upper_bound = atom14_pred_positions.new_tensor(
        restype_atom14_bounds["upper_bound"]
    )[batch["aatype"]]
    residue_violations = within_residue_violations(
        atom14_pred_positions=atom14_pred_positions,
        atom14_atom_exists=batch["atom14_atom_exists"],
        atom14_dists_lower_bound=atom14_dists_lower_bound,
        atom14_dists_upper_bound=atom14_dists_upper_bound,
        tighten_bounds_for_loss=0.0,
    )
    # Combine them to a single per-residue violation mask (used later for LDDT).
    per_residue_violations_mask = torch.max(
        torch.stack(
            [
                connection_violations["per_residue_violation_mask"],
                torch.max(
                    between_residue_clashes["per_atom_clash_mask"], dim=-1
                )[0],
                torch.max(residue_violations["per_atom_violations"], dim=-1)[0],
            ],
            dim=-1,
        ),
        dim=-1,
    )[0]
    return {
        "between_residues": {
            "bonds_c_n_loss_mean": connection_violations["c_n_loss_mean"], # ()
            "angles_ca_c_n_loss_mean": connection_violations[
                "ca_c_n_loss_mean"
            ], # ()
            "angles_c_n_ca_loss_mean": connection_violations[
                "c_n_ca_loss_mean"
            ], # ()
            "connections_per_residue_loss_sum": connection_violations[
                "per_residue_loss_sum"
            ], # (N)
            "connections_per_residue_violation_mask": connection_violations[
                "per_residue_violation_mask"
            ], # (N)
            "clashes_mean_loss": between_residue_clashes["mean_loss"], # ()
            "clashes_per_atom_loss_sum": between_residue_clashes[
                "per_atom_loss_sum"
            ], # (N, 14)
            "clashes_per_atom_clash_mask": between_residue_clashes[
                "per_atom_clash_mask"
            ], # (N, 14)
        },
        "within_residues": {
            "per_atom_loss_sum": residue_violations[
                "per_atom_loss_sum"
            ], # (N, 14)
            "per_atom_violations": residue_violations[
                "per_atom_violations"
            ], # (N, 14),
        },
        "total_per_residue_violations_mask": per_residue_violations_mask, # (N)
    }
def find_structural_violations_np(
    batch: Dict[str, np.ndarray],
    atom14_pred_positions: np.ndarray,
    config: ml_collections.ConfigDict,
) -> Dict[str, np.ndarray]:
    """NumPy front-end for find_structural_violations.

    Converts the inputs to torch tensors, runs the torch implementation with
    the settings in `config`, and converts the result tree back to numpy.
    """
    batch_t = tree_map(lambda a: torch.tensor(a), batch, np.ndarray)
    positions_t = torch.tensor(atom14_pred_positions)
    violations = find_structural_violations(batch_t, positions_t, **config)
    return tensor_tree_map(lambda t: np.array(t), violations)
def extreme_ca_ca_distance_violations(
    pred_atom_positions: torch.Tensor,  # (N, 37(14), 3)
    pred_atom_mask: torch.Tensor,  # (N, 37(14))
    residue_index: torch.Tensor,  # (N)
    max_angstrom_tolerance=1.5,
    eps=1e-6,
) -> torch.Tensor:
    """Counts residues whose Ca is a large distance from its neighbour.

    Measures the fraction of CA-CA pairs between consecutive amino acids that
    are more than 'max_angstrom_tolerance' beyond the literature CA-CA
    distance apart.

    Args:
        pred_atom_positions: Atom positions in atom37/14 representation.
        pred_atom_mask: Atom mask in atom37/14 representation.
        residue_index: Residue index per amino acid, assumed monotonically
            increasing.
        max_angstrom_tolerance: Maximum excess distance that is not counted
            as a violation.
    Returns:
        Fraction of consecutive CA-CA pairs with violation.
    """
    # CA is atom index 1 in both the atom37 and atom14 orderings.
    ca_pos_prev = pred_atom_positions[..., :-1, 1, :]
    ca_mask_prev = pred_atom_mask[..., :-1, 1]
    ca_pos_next = pred_atom_positions[..., 1:, 1, :]
    ca_mask_next = pred_atom_mask[..., 1:, 1]
    # Only residue pairs that are truly consecutive in the chain count.
    no_gap = (residue_index[..., 1:] - residue_index[..., :-1]) == 1.0
    sq_dist = torch.sum((ca_pos_prev - ca_pos_next) ** 2, dim=-1)
    ca_ca_dist = torch.sqrt(eps + sq_dist)
    exceeds = (ca_ca_dist - residue_constants.ca_ca) > max_angstrom_tolerance
    pair_mask = ca_mask_prev * ca_mask_next * no_gap
    return masked_mean(pair_mask, exceeds, -1)
def compute_violation_metrics(
    batch: Dict[str, torch.Tensor],
    atom14_pred_positions: torch.Tensor,  # (N, 14, 3)
    violations: Dict[str, torch.Tensor],
) -> Dict[str, torch.Tensor]:
    """Compute several scalar metrics summarizing the structural violations.

    Args:
        batch: Feature dict; reads "atom14_atom_exists", "residue_index"
            and "seq_mask".
        atom14_pred_positions: Predicted atom positions, (N, 14, 3).
        violations: Output of find_structural_violations.
    Returns:
        Dict of sequence-masked mean violation fractions.
    """
    between = violations["between_residues"]
    within = violations["within_residues"]
    seq_mask = batch["seq_mask"]
    metrics = {}
    metrics["violations_extreme_ca_ca_distance"] = (
        extreme_ca_ca_distance_violations(
            pred_atom_positions=atom14_pred_positions,
            pred_atom_mask=batch["atom14_atom_exists"],
            residue_index=batch["residue_index"],
        )
    )
    metrics["violations_between_residue_bond"] = masked_mean(
        mask=seq_mask,
        value=between["connections_per_residue_violation_mask"],
        dim=-1,
    )
    metrics["violations_between_residue_clash"] = masked_mean(
        mask=seq_mask,
        value=torch.max(between["clashes_per_atom_clash_mask"], dim=-1)[0],
        dim=-1,
    )
    metrics["violations_within_residue"] = masked_mean(
        mask=seq_mask,
        value=torch.max(within["per_atom_violations"], dim=-1)[0],
        dim=-1,
    )
    metrics["violations_per_residue"] = masked_mean(
        mask=seq_mask,
        value=violations["total_per_residue_violations_mask"],
        dim=-1,
    )
    return metrics
def compute_violation_metrics_np(
    batch: Dict[str, np.ndarray],
    atom14_pred_positions: np.ndarray,
    violations: Dict[str, np.ndarray],
) -> Dict[str, np.ndarray]:
    """NumPy front-end for compute_violation_metrics.

    Converts all numpy inputs to torch tensors, computes the metrics, and
    converts the resulting tensor tree back to numpy arrays.
    """
    batch_t = tree_map(lambda a: torch.tensor(a), batch, np.ndarray)
    positions_t = torch.tensor(atom14_pred_positions)
    violations_t = tree_map(lambda a: torch.tensor(a), violations, np.ndarray)
    metrics = compute_violation_metrics(batch_t, positions_t, violations_t)
    return tree_map(lambda t: np.array(t), metrics, torch.Tensor)
def violation_loss(
    violations: Dict[str, torch.Tensor],
    atom14_atom_exists: torch.Tensor,
    eps=1e-6,
    **kwargs,
) -> torch.Tensor:
    """Combines the structural-violation terms into a single scalar loss.

    Args:
        violations: Output of find_structural_violations.
        atom14_atom_exists: [*, N, 14] atom existence mask; its sum
            normalizes the per-atom clash losses.
        eps: Small constant guarding against division by zero.
    Returns:
        Scalar violation loss.
    """
    between = violations["between_residues"]
    within = violations["within_residues"]
    # Average all per-atom clash losses over the number of existing atoms.
    num_atoms = torch.sum(atom14_atom_exists)
    clash_total = torch.sum(
        between["clashes_per_atom_loss_sum"] + within["per_atom_loss_sum"]
    )
    l_clash = clash_total / (eps + num_atoms)
    return (
        between["bonds_c_n_loss_mean"]
        + between["angles_ca_c_n_loss_mean"]
        + between["angles_c_n_ca_loss_mean"]
        + l_clash
    )
def compute_renamed_ground_truth(
batch: Dict[str, torch.Tensor],
atom14_pred_positions: torch.Tensor,
eps=1e-10,
) -> Dict[str, torch.Tensor]:
"""
Find optimal renaming of ground truth based on the predicted positions.
Alg. 26 "renameSymmetricGroundTruthAtoms"
This renamed ground truth is then used for all losses,
such that each loss moves the atoms in the same direction.
Args:
batch: Dictionary containing:
* atom14_gt_positions: Ground truth positions.
* atom14_alt_gt_positions: Ground truth positions with renaming swaps.
* atom14_atom_is_ambiguous: 1.0 for atoms that are affected by
renaming swaps.
* atom14_gt_exists: Mask for which atoms exist in ground truth.
* atom14_alt_gt_exists: Mask for which atoms exist in ground truth
after renaming.
* atom14_atom_exists: Mask for whether each atom is part of the given
amino acid type.
atom14_pred_positions: Array of atom positions in global frame with shape
Returns:
Dictionary containing:
alt_naming_is_better: Array with 1.0 where alternative swap is better.
renamed_atom14_gt_positions: Array of optimal ground truth positions
after renaming swaps are performed.
renamed_atom14_gt_exists: Mask after renaming swap is performed.
"""
pred_dists = torch.sqrt(
eps
+ torch.sum(
(
atom14_pred_positions[..., None, :, None, :]
- atom14_pred_positions[..., None, :, None, :, :]
)
** 2,
dim=-1,
)
)
atom14_gt_positions = batch["atom14_gt_positions"]
gt_dists = torch.sqrt(
eps
+ torch.sum(
(
atom14_gt_positions[..., None, :, None, :]
- atom14_gt_positions[..., None, :, None, :, :]
)
** 2,
dim=-1,
)
)
atom14_alt_gt_positions = batch["atom14_alt_gt_positions"]
alt_gt_dists = torch.sqrt(
eps
+ torch.sum(
(
atom14_alt_gt_positions[..., None, :, None, :]
- atom14_alt_gt_positions[..., None, :, None, :, :]
)
** 2,
dim=-1,
)
)
lddt = torch.sqrt(eps + (pred_dists - gt_dists) ** 2)
alt_lddt = torch.sqrt(eps + (pred_dists - alt_gt_dists) ** 2)
atom14_gt_exists = batch["atom14_gt_exists"]
atom14_atom_is_ambiguous = batch["atom14_atom_is_ambiguous"]
mask = (
atom14_gt_exists[..., None, :, None]
* atom14_atom_is_ambiguous[..., None, :, None]
* atom14_gt_exists[..., None, :, None, :]
* (1.0 - atom14_atom_is_ambiguous[..., None, :, None, :])
)
per_res_lddt = torch.sum(mask * lddt, dim=(-1, -2, -3))
alt_per_res_lddt = torch.sum(mask * alt_lddt, dim=(-1, -2, -3))
fp_type = atom14_pred_positions.dtype
alt_naming_is_better = (alt_per_res_lddt < per_res_lddt).type(fp_type)
renamed_atom14_gt_positions = (
1.0 - alt_naming_is_better[..., None, None]
) * atom14_gt_positions + alt_naming_is_better[
..., None, None
] * atom14_alt_gt_positions
renamed_atom14_gt_mask = (
1.0 - alt_naming_is_better[..., None]
) * atom14_gt_exists + alt_naming_is_better[..., None] * batch[
"atom14_alt_gt_exists"
]
return {
"alt_naming_is_better": alt_naming_is_better,
"renamed_atom14_gt_positions": renamed_atom14_gt_positions,
"renamed_atom14_gt_exists": renamed_atom14_gt_mask,
}
def experimentally_resolved_loss(
    logits: torch.Tensor,
    atom37_atom_exists: torch.Tensor,
    all_atom_mask: torch.Tensor,
    resolution: torch.Tensor,
    min_resolution: float,
    max_resolution: float,
    eps: float = 1e-8,
    **kwargs,
) -> torch.Tensor:
    """Loss predicting which atoms are experimentally resolved.

    Sigmoid cross-entropy between the per-atom logits and the observed
    all-atom mask, averaged over existing atoms. Zeroed out for structures
    whose resolution is outside [min_resolution, max_resolution].

    Args:
        logits: Per-atom logits.
        atom37_atom_exists: [*, N, 37] atom existence mask.
        all_atom_mask: Ground-truth resolved-atom mask (the target).
        resolution: Structure resolution per example.
        min_resolution / max_resolution: Resolution window in which the loss
            is applied.
    Returns:
        Scalar loss.
    """
    xent = sigmoid_cross_entropy(logits, all_atom_mask)
    per_residue = torch.sum(xent * atom37_atom_exists, dim=-1)
    denom = eps + torch.sum(atom37_atom_exists, dim=(-1, -2))
    per_residue = per_residue / denom
    total = torch.sum(per_residue, dim=-1)
    in_window = (resolution >= min_resolution) & (resolution <= max_resolution)
    return torch.mean(total * in_window)
def masked_msa_loss(logits, true_msa, bert_mask, eps=1e-8, **kwargs):
    """
    Computes BERT-style masked MSA loss. Implements subsection 1.9.9.

    Args:
        logits: [*, N_seq, N_res, 23] predicted residue distribution
        true_msa: [*, N_seq, N_res] true MSA
        bert_mask: [*, N_seq, N_res] MSA mask
    Returns:
        Masked MSA loss (scalar)
    """
    targets = torch.nn.functional.one_hot(true_msa, num_classes=23)
    errors = softmax_cross_entropy(logits, targets)
    # FP16-friendly two-stage averaging. Mathematically equivalent to
    # sum(errors * mask) / (eps + sum(mask)) over the last two dims, but the
    # 0.5 scale keeps intermediate sums in fp16 range.
    masked = errors * bert_mask
    row_sums = torch.sum(masked, dim=-1)
    scale = 0.5
    denom = eps + torch.sum(scale * bert_mask, dim=(-1, -2))
    row_sums = row_sums / denom[..., None]
    per_example = torch.sum(row_sums, dim=-1) * scale
    return torch.mean(per_example)
class AlphaFoldLoss(nn.Module):
    """Aggregation of the various losses described in the supplement."""
    def __init__(self, config):
        """
        Args:
            config:
                Loss configuration. Must expose one sub-config per entry of
                `loss_fns` in forward() (each with a `weight` attribute),
                plus the `violation` and `tm` sections.
        """
        super(AlphaFoldLoss, self).__init__()
        self.config = config

    def forward(self, out, batch, _return_breakdown=False):
        """
        Computes the weighted sum of all enabled losses.

        Args:
            out:
                Model output dict (structure-module positions and logits).
            batch:
                Feature dict. Updated in place with the renamed ground
                truth when it is not already present.
            _return_breakdown:
                If True, also return the individual unweighted loss terms.
        Returns:
            The cumulative loss, and optionally the per-loss breakdown.
        """
        # Violations and the renamed ground truth are expensive; compute
        # them only if the caller has not supplied them already.
        if "violation" not in out.keys():
            out["violation"] = find_structural_violations(
                batch,
                out["sm"]["positions"][-1],
                **self.config.violation,
            )
        if "renamed_atom14_gt_positions" not in out.keys():
            batch.update(
                compute_renamed_ground_truth(
                    batch,
                    out["sm"]["positions"][-1],
                )
            )
        # Each entry is lazily evaluated so disabled/NaN terms are cheap to
        # skip; batch and per-loss config are merged into the kwargs.
        loss_fns = {
            "distogram": lambda: distogram_loss(
                logits=out["distogram_logits"],
                **{**batch, **self.config.distogram},
            ),
            "experimentally_resolved": lambda: experimentally_resolved_loss(
                logits=out["experimentally_resolved_logits"],
                **{**batch, **self.config.experimentally_resolved},
            ),
            "fape": lambda: fape_loss(
                out,
                batch,
                self.config.fape,
            ),
            "plddt_loss": lambda: lddt_loss(
                logits=out["lddt_logits"],
                all_atom_pred_pos=out["final_atom_positions"],
                **{**batch, **self.config.plddt_loss},
            ),
            "masked_msa": lambda: masked_msa_loss(
                logits=out["masked_msa_logits"],
                **{**batch, **self.config.masked_msa},
            ),
            "supervised_chi": lambda: supervised_chi_loss(
                out["sm"]["angles"],
                out["sm"]["unnormalized_angles"],
                **{**batch, **self.config.supervised_chi},
            ),
            "violation": lambda: violation_loss(
                out["violation"],
                **batch,
            ),
        }
        if(self.config.tm.enabled):
            loss_fns["tm"] = lambda: tm_loss(
                logits=out["tm_logits"],
                **{**batch, **out, **self.config.tm},
            )
        cum_loss = 0.
        losses = {}
        for loss_name, loss_fn in loss_fns.items():
            weight = self.config[loss_name].weight
            loss = loss_fn()
            if(torch.isnan(loss) or torch.isinf(loss)):
                # A non-finite term would poison the total; drop it but keep
                # the graph alive with a zero that requires grad.
                logging.warning(f"{loss_name} loss is NaN. Skipping...")
                loss = loss.new_tensor(0., requires_grad=True)
            cum_loss = cum_loss + weight * loss
            losses[loss_name] = loss.detach().clone()
        losses["unscaled_loss"] = cum_loss.detach().clone()
        # Scale the loss by the square root of the minimum of the crop size
        # and the (average) sequence length. See subsection 1.9.
        seq_len = torch.mean(batch["seq_length"].float())
        crop_len = batch["aatype"].shape[-1]
        # BUG FIX: min(seq_len, crop_len) returned the plain Python int
        # `crop_len` whenever the crop was the smaller of the two, and
        # torch.sqrt rejects non-tensor input. clamp computes the same
        # minimum but always yields a tensor.
        cum_loss = cum_loss * torch.sqrt(seq_len.clamp(max=crop_len))
        losses["loss"] = cum_loss.detach().clone()
        if(not _return_breakdown):
            return cum_loss
        return cum_loss, losses
import torch
class AlphaFoldLRScheduler(torch.optim.lr_scheduler._LRScheduler):
    """ Implements the learning rate schedule defined in the AlphaFold 2
        supplement. A linear warmup is followed by a plateau at the maximum
        learning rate and then exponential decay.

        Note that the initial learning rate of the optimizer in question is
        ignored; use this class' base_lr parameter to specify the starting
        point of the warmup.
    """
    def __init__(self,
        optimizer,
        last_epoch: int = -1,
        verbose: bool = False,
        base_lr: float = 0.,
        max_lr: float = 0.001,
        warmup_no_steps: int = 1000,
        start_decay_after_n_steps: int = 50000,
        decay_every_n_steps: int = 50000,
        decay_factor: float = 0.95,
    ):
        """
        Args:
            optimizer: Wrapped optimizer.
            last_epoch: Index of the last step (-1 starts fresh).
            verbose: Forwarded to the torch scheduler base class.
            base_lr: Learning rate at the start of the warmup.
            max_lr: Learning rate at the end of the warmup / on the plateau.
            warmup_no_steps: Number of linear warmup steps.
            start_decay_after_n_steps: Step at which exponential decay begins.
            decay_every_n_steps: Interval between decay applications.
            decay_factor: Multiplicative decay per interval.
        Raises:
            ValueError: If any step count is negative, warmup extends past
                the decay start, or decay_every_n_steps is zero.
        """
        step_counts = {
            "warmup_no_steps": warmup_no_steps,
            "start_decay_after_n_steps": start_decay_after_n_steps,
            # Previously unvalidated; a negative value silently produced a
            # nonsensical schedule.
            "decay_every_n_steps": decay_every_n_steps,
        }
        for k, v in step_counts.items():
            if(v < 0):
                raise ValueError(f"{k} must be nonnegative")
        if(warmup_no_steps > start_decay_after_n_steps):
            raise ValueError(
                "warmup_no_steps must not exceed start_decay_after_n_steps"
            )
        # A zero interval would crash with ZeroDivisionError at decay time.
        if(decay_every_n_steps == 0):
            raise ValueError("decay_every_n_steps must be positive")
        self.optimizer = optimizer
        self.last_epoch = last_epoch
        self.verbose = verbose
        self.base_lr = base_lr
        self.max_lr = max_lr
        self.warmup_no_steps = warmup_no_steps
        self.start_decay_after_n_steps = start_decay_after_n_steps
        self.decay_every_n_steps = decay_every_n_steps
        self.decay_factor = decay_factor
        super(AlphaFoldLRScheduler, self).__init__(
            optimizer,
            last_epoch=last_epoch,
            verbose=verbose,
        )

    def state_dict(self):
        """Returns the scheduler state, excluding the optimizer itself."""
        state_dict = {
            k:v for k,v in self.__dict__.items() if k not in ["optimizer"]
        }
        return state_dict

    def load_state_dict(self, state_dict):
        """Restores the scheduler state produced by state_dict()."""
        self.__dict__.update(state_dict)

    def get_lr(self):
        """Computes the learning rate for the current step (one value per
        optimizer param group)."""
        if(not self._get_lr_called_within_step):
            raise RuntimeError(
                "To get the last learning rate computed by the scheduler, use "
                "get_last_lr()"
            )
        step_no = self.last_epoch
        if(step_no <= self.warmup_no_steps):
            # max(..., 1) guards against ZeroDivisionError when warmup is
            # disabled (warmup_no_steps == 0); the lr is then base_lr.
            lr = self.base_lr + (
                step_no / max(self.warmup_no_steps, 1)
            ) * self.max_lr
        elif(step_no > self.start_decay_after_n_steps):
            steps_since_decay = step_no - self.start_decay_after_n_steps
            exp = (steps_since_decay // self.decay_every_n_steps) + 1
            lr = self.max_lr * (self.decay_factor ** exp)
        else: # plateau
            lr = self.max_lr
        return [lr for group in self.optimizer.param_groups]
# Copyright 2022 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import torch
def is_fp16_enabled():
    """Returns True iff GPU autocast is active and its dtype is float16."""
    autocast_active = torch.is_autocast_enabled()
    return autocast_active and torch.get_autocast_gpu_dtype() == torch.float16
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from functools import lru_cache
from typing import Tuple, Any, Sequence, Callable, Optional
import numpy as np
import torch
def rot_matmul(
    a: torch.Tensor,
    b: torch.Tensor
) -> torch.Tensor:
    """
    Performs matrix multiplication of two rotation matrix tensors. The 3x3
    product is expanded by hand (rather than using torch.matmul) so that AMP
    cannot downcast the computation.

    Args:
        a: [*, 3, 3] left multiplicand
        b: [*, 3, 3] right multiplicand
    Returns:
        The product ab
    """
    rows = []
    for i in range(3):
        # Entry (i, j) is the dot product of row i of a with column j of b.
        cols = [
            a[..., i, 0] * b[..., 0, j]
            + a[..., i, 1] * b[..., 1, j]
            + a[..., i, 2] * b[..., 2, j]
            for j in range(3)
        ]
        rows.append(torch.stack(cols, dim=-1))
    return torch.stack(rows, dim=-2)
def rot_vec_mul(
    r: torch.Tensor,
    t: torch.Tensor
) -> torch.Tensor:
    """
    Applies a rotation to a vector. Expanded by hand (rather than using a
    batched matmul) so that AMP cannot downcast the computation.

    Args:
        r: [*, 3, 3] rotation matrices
        t: [*, 3] coordinate tensors
    Returns:
        [*, 3] rotated coordinates
    """
    coords = torch.unbind(t, dim=-1)
    # Component i of the result is the dot product of row i with t.
    rotated = [
        r[..., i, 0] * coords[0]
        + r[..., i, 1] * coords[1]
        + r[..., i, 2] * coords[2]
        for i in range(3)
    ]
    return torch.stack(rotated, dim=-1)
@lru_cache(maxsize=None)
def identity_rot_mats(
    batch_dims: Tuple[int],
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
    requires_grad: bool = True,
) -> torch.Tensor:
    """Returns a cached [*batch_dims, 3, 3] stack of identity matrices.

    The result is memoized per argument combination, so callers share one
    tensor for identical (batch_dims, dtype, device, requires_grad) inputs.
    """
    eye = torch.eye(
        3, dtype=dtype, device=device, requires_grad=requires_grad
    )
    eye = eye.view(*((1,) * len(batch_dims)), 3, 3)
    return eye.expand(*batch_dims, -1, -1).contiguous()
@lru_cache(maxsize=None)
def identity_trans(
    batch_dims: Tuple[int],
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
    requires_grad: bool = True,
) -> torch.Tensor:
    """Returns a cached all-zero translation tensor of shape (*batch_dims, 3).

    Memoized per argument combination, so identical requests share a tensor.
    """
    return torch.zeros(
        (*batch_dims, 3),
        dtype=dtype,
        device=device,
        requires_grad=requires_grad,
    )
@lru_cache(maxsize=None)
def identity_quats(
    batch_dims: Tuple[int],
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
    requires_grad: bool = True,
) -> torch.Tensor:
    """Returns a cached [*batch_dims, 4] stack of identity quaternions
    (1, 0, 0, 0).

    Memoized per argument combination, so identical requests share a tensor.
    """
    quats = torch.zeros(
        (*batch_dims, 4),
        dtype=dtype,
        device=device,
        requires_grad=requires_grad,
    )
    # Set the real component in-place without recording the op on the graph.
    with torch.no_grad():
        quats[..., 0] = 1
    return quats
# Names of the four quaternion components (real part "a", vector "b","c","d").
_quat_elements = ["a", "b", "c", "d"]
# All 16 ordered component pairs, e.g. "aa", "ab", ..., "dd".
_qtr_keys = [l1 + l2 for l1 in _quat_elements for l2 in _quat_elements]
# Maps a pair key to its flat index into a 4x4 coefficient matrix.
_qtr_ind_dict = {key: ind for ind, key in enumerate(_qtr_keys)}
def _to_mat(pairs):
    """Builds a 4x4 coefficient matrix from (pair-key, value) entries.

    Each key (e.g. "ab") selects one cell via _qtr_ind_dict; unspecified
    cells remain zero.
    """
    mat = np.zeros((4, 4))
    for key, value in pairs:
        flat_index = _qtr_ind_dict[key]
        mat[flat_index // 4][flat_index % 4] = value
    return mat
# Coefficient tensor for the standard quaternion-to-rotation-matrix formula:
# entry [i, j, r, c] is the coefficient of q_i * q_j in rotation entry R[r, c].
_QTR_MAT = np.zeros((4, 4, 3, 3))
_QTR_MAT[..., 0, 0] = _to_mat([("aa", 1), ("bb", 1), ("cc", -1), ("dd", -1)])
_QTR_MAT[..., 0, 1] = _to_mat([("bc", 2), ("ad", -2)])
_QTR_MAT[..., 0, 2] = _to_mat([("bd", 2), ("ac", 2)])
_QTR_MAT[..., 1, 0] = _to_mat([("bc", 2), ("ad", 2)])
_QTR_MAT[..., 1, 1] = _to_mat([("aa", 1), ("bb", -1), ("cc", 1), ("dd", -1)])
_QTR_MAT[..., 1, 2] = _to_mat([("cd", 2), ("ab", -2)])
_QTR_MAT[..., 2, 0] = _to_mat([("bd", 2), ("ac", -2)])
_QTR_MAT[..., 2, 1] = _to_mat([("cd", 2), ("ab", 2)])
_QTR_MAT[..., 2, 2] = _to_mat([("aa", 1), ("bb", -1), ("cc", -1), ("dd", 1)])
def quat_to_rot(quat: torch.Tensor) -> torch.Tensor:
    """
    Converts a quaternion to a rotation matrix by contracting the
    quaternion's outer product with the fixed _QTR_MAT coefficient tensor.

    Args:
        quat: [*, 4] quaternions
    Returns:
        [*, 3, 3] rotation matrices
    """
    # [*, 4, 4] outer products of quaternion components
    outer = quat[..., None] * quat[..., None, :]
    # [4, 4, 3, 3] constant coefficients, cached per dtype/device
    coeffs = _get_quat("_QTR_MAT", dtype=quat.dtype, device=quat.device)
    # [*, 4, 4, 3, 3] broadcastable view of the coefficients
    coeffs = coeffs.view((1,) * len(outer.shape[:-2]) + coeffs.shape)
    weighted = outer[..., None, None] * coeffs
    # Contract the two quaternion axes -> [*, 3, 3]
    return torch.sum(weighted, dim=(-3, -4))
def rot_to_quat(
    rot: torch.Tensor,
):
    """Converts [*, 3, 3] rotation matrices to [*, 4] quaternions.

    Builds the symmetric 4x4 matrix K from the rotation entries and takes
    the eigenvector of its largest eigenvalue (sign is not fixed).

    Raises:
        ValueError: If the trailing dims are not (3, 3).
    """
    if(rot.shape[-2:] != (3, 3)):
        raise ValueError("Input rotation is incorrectly shaped")
    entries = [[rot[..., i, j] for j in range(3)] for i in range(3)]
    [[xx, xy, xz], [yx, yy, yz], [zx, zy, zz]] = entries
    k_rows = [
        torch.stack([xx + yy + zz, zy - yz, xz - zx, yx - xy], dim=-1),
        torch.stack([zy - yz, xx - yy - zz, xy + yx, xz + zx], dim=-1),
        torch.stack([xz - zx, xy + yx, yy - xx - zz, yz + zy], dim=-1),
        torch.stack([yx - xy, xz + zx, yz + zy, zz - xx - yy], dim=-1),
    ]
    k = (1./3.) * torch.stack(k_rows, dim=-2)
    # torch.linalg.eigh returns eigenvalues in ascending order; the desired
    # quaternion is the eigenvector of the largest one (last column).
    _, eigvecs = torch.linalg.eigh(k)
    return eigvecs[..., -1]
# Coefficient tensor for the Hamilton product: entry [i, j, k] is the
# coefficient of q1_i * q2_j in component k of the product q1 * q2.
_QUAT_MULTIPLY = np.zeros((4, 4, 4))
_QUAT_MULTIPLY[:, :, 0] = [[ 1, 0, 0, 0],
                          [ 0,-1, 0, 0],
                          [ 0, 0,-1, 0],
                          [ 0, 0, 0,-1]]
_QUAT_MULTIPLY[:, :, 1] = [[ 0, 1, 0, 0],
                          [ 1, 0, 0, 0],
                          [ 0, 0, 0, 1],
                          [ 0, 0,-1, 0]]
_QUAT_MULTIPLY[:, :, 2] = [[ 0, 0, 1, 0],
                          [ 0, 0, 0,-1],
                          [ 1, 0, 0, 0],
                          [ 0, 1, 0, 0]]
_QUAT_MULTIPLY[:, :, 3] = [[ 0, 0, 0, 1],
                          [ 0, 0, 1, 0],
                          [ 0,-1, 0, 0],
                          [ 1, 0, 0, 0]]
# Same product with the left operand's rows restricted to the vector part,
# i.e. the right-multiplication by a pure-vector (zero real part) quaternion.
_QUAT_MULTIPLY_BY_VEC = _QUAT_MULTIPLY[:, 1:, :]
# Constant tensors served (and cached per dtype/device) by _get_quat.
_CACHED_QUATS = {
    "_QTR_MAT": _QTR_MAT,
    "_QUAT_MULTIPLY": _QUAT_MULTIPLY,
    "_QUAT_MULTIPLY_BY_VEC": _QUAT_MULTIPLY_BY_VEC
}
@lru_cache(maxsize=None)
def _get_quat(quat_key, dtype, device):
    """Returns the constant tensor named by `quat_key` from _CACHED_QUATS,
    materialized once per (quat_key, dtype, device) combination."""
    return torch.tensor(_CACHED_QUATS[quat_key], dtype=dtype, device=device)
def quat_multiply(quat1, quat2):
    """Hamilton product of two quaternion tensors [*, 4]."""
    coeffs = _get_quat("_QUAT_MULTIPLY", dtype=quat1.dtype, device=quat1.device)
    # Broadcastable [*, 4, 4, 4] view of the constant coefficients.
    coeffs = coeffs.view((1,) * len(quat1.shape[:-1]) + coeffs.shape)
    product = (
        coeffs
        * quat1[..., :, None, None]
        * quat2[..., None, :, None]
    )
    # Contract both operand axes, leaving the output component axis.
    return torch.sum(product, dim=(-3, -2))
def quat_multiply_by_vec(quat, vec):
    """Hamilton product of a quaternion [*, 4] with a pure-vector
    quaternion given by its vector part [*, 3]."""
    coeffs = _get_quat(
        "_QUAT_MULTIPLY_BY_VEC", dtype=quat.dtype, device=quat.device
    )
    # Broadcastable [*, 4, 3, 4] view of the constant coefficients.
    coeffs = coeffs.view((1,) * len(quat.shape[:-1]) + coeffs.shape)
    product = (
        coeffs
        * quat[..., :, None, None]
        * vec[..., None, :, None]
    )
    # Contract both operand axes, leaving the output component axis.
    return torch.sum(product, dim=(-3, -2))
def invert_rot_mat(rot_mat: torch.Tensor):
    """Inverts [*, 3, 3] rotation matrices by transposing the last two dims
    (for orthonormal R, R^-1 == R^T)."""
    return torch.transpose(rot_mat, -1, -2)
def invert_quat(quat: torch.Tensor):
    """Inverts quaternions [*, 4]: the conjugate divided by the squared norm
    (equal to the conjugate alone for unit quaternions)."""
    conjugate = quat.clone()
    # Negate the vector part, keep the real component.
    conjugate[..., 1:] = conjugate[..., 1:] * -1
    squared_norm = torch.sum(quat ** 2, dim=-1, keepdim=True)
    return conjugate / squared_norm
class Rotation:
"""
A 3D rotation. Depending on how the object is initialized, the
rotation is represented by either a rotation matrix or a
quaternion, though both formats are made available by helper functions.
To simplify gradient computation, the underlying format of the
rotation cannot be changed in-place. Like Rigid, the class is designed
to mimic the behavior of a torch Tensor, almost as if each Rotation
object were a tensor of rotations, in one format or another.
"""
def __init__(self,
rot_mats: Optional[torch.Tensor] = None,
quats: Optional[torch.Tensor] = None,
normalize_quats: bool = True,
):
"""
Args:
rot_mats:
A [*, 3, 3] rotation matrix tensor. Mutually exclusive with
quats
quats:
A [*, 4] quaternion. Mutually exclusive with rot_mats. If
normalize_quats is not True, must be a unit quaternion
normalize_quats:
If quats is specified, whether to normalize quats
"""
if((rot_mats is None and quats is None) or
(rot_mats is not None and quats is not None)):
raise ValueError("Exactly one input argument must be specified")
if((rot_mats is not None and rot_mats.shape[-2:] != (3, 3)) or
(quats is not None and quats.shape[-1] != 4)):
raise ValueError(
"Incorrectly shaped rotation matrix or quaternion"
)
# Force full-precision
if(quats is not None):
quats = quats.to(dtype=torch.float32)
if(rot_mats is not None):
rot_mats = rot_mats.to(dtype=torch.float32)
if(quats is not None and normalize_quats):
quats = quats / torch.linalg.norm(quats, dim=-1, keepdim=True)
self._rot_mats = rot_mats
self._quats = quats
@staticmethod
def identity(
shape,
dtype: Optional[torch.dtype] = None,
device: Optional[torch.device] = None,
requires_grad: bool = True,
fmt: str = "quat",
) -> Rotation:
"""
Returns an identity Rotation.
Args:
shape:
The "shape" of the resulting Rotation object. See documentation
for the shape property
dtype:
The torch dtype for the rotation
device:
The torch device for the new rotation
requires_grad:
Whether the underlying tensors in the new rotation object
should require gradient computation
fmt:
One of "quat" or "rot_mat". Determines the underlying format
of the new object's rotation
Returns:
A new identity rotation
"""
if(fmt == "rot_mat"):
rot_mats = identity_rot_mats(
shape, dtype, device, requires_grad,
)
return Rotation(rot_mats=rot_mats, quats=None)
elif(fmt == "quat"):
quats = identity_quats(shape, dtype, device, requires_grad)
return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
else:
raise ValueError(f"Invalid format: f{fmt}")
# Magic methods
def __getitem__(self, index: Any) -> Rotation:
"""
Allows torch-style indexing over the virtual shape of the rotation
object. See documentation for the shape property.
Args:
index:
A torch index. E.g. (1, 3, 2), or (slice(None,))
Returns:
The indexed rotation
"""
if type(index) != tuple:
index = (index,)
if(self._rot_mats is not None):
rot_mats = self._rot_mats[index + (slice(None), slice(None))]
return Rotation(rot_mats=rot_mats)
elif(self._quats is not None):
quats = self._quats[index + (slice(None),)]
return Rotation(quats=quats, normalize_quats=False)
else:
raise ValueError("Both rotations are None")
def __mul__(self,
right: torch.Tensor,
) -> Rotation:
"""
Pointwise left multiplication of the rotation with a tensor. Can be
used to e.g. mask the Rotation.
Args:
right:
The tensor multiplicand
Returns:
The product
"""
if not(isinstance(right, torch.Tensor)):
raise TypeError("The other multiplicand must be a Tensor")
if(self._rot_mats is not None):
rot_mats = self._rot_mats * right[..., None, None]
return Rotation(rot_mats=rot_mats, quats=None)
elif(self._quats is not None):
quats = self._quats * right[..., None]
return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
else:
raise ValueError("Both rotations are None")
def __rmul__(self,
left: torch.Tensor,
) -> Rotation:
"""
Reverse pointwise multiplication of the rotation with a tensor.
Args:
left:
The left multiplicand
Returns:
The product
"""
return self.__mul__(left)
# Properties
@property
def shape(self) -> torch.Size:
"""
Returns the virtual shape of the rotation object. This shape is
defined as the batch dimensions of the underlying rotation matrix
or quaternion. If the Rotation was initialized with a [10, 3, 3]
rotation matrix tensor, for example, the resulting shape would be
[10].
Returns:
The virtual shape of the rotation object
"""
s = None
if(self._quats is not None):
s = self._quats.shape[:-1]
else:
s = self._rot_mats.shape[:-2]
return s
@property
def dtype(self) -> torch.dtype:
"""
Returns the dtype of the underlying rotation.
Returns:
The dtype of the underlying rotation
"""
if(self._rot_mats is not None):
return self._rot_mats.dtype
elif(self._quats is not None):
return self._quats.dtype
else:
raise ValueError("Both rotations are None")
@property
def device(self) -> torch.device:
"""
The device of the underlying rotation
Returns:
The device of the underlying rotation
"""
if(self._rot_mats is not None):
return self._rot_mats.device
elif(self._quats is not None):
return self._quats.device
else:
raise ValueError("Both rotations are None")
@property
def requires_grad(self) -> bool:
"""
Returns the requires_grad property of the underlying rotation
Returns:
The requires_grad property of the underlying tensor
"""
if(self._rot_mats is not None):
return self._rot_mats.requires_grad
elif(self._quats is not None):
return self._quats.requires_grad
else:
raise ValueError("Both rotations are None")
def get_rot_mats(self) -> torch.Tensor:
"""
Returns the underlying rotation as a rotation matrix tensor.
Returns:
The rotation as a rotation matrix tensor
"""
rot_mats = self._rot_mats
if(rot_mats is None):
if(self._quats is None):
raise ValueError("Both rotations are None")
else:
rot_mats = quat_to_rot(self._quats)
return rot_mats
def get_quats(self) -> torch.Tensor:
    """
    Returns the underlying rotation as a quaternion tensor.

    Depending on whether the Rotation was initialized with a
    quaternion, this function may call torch.linalg.eigh.

    Returns:
        The rotation as a quaternion tensor.
    """
    if self._quats is not None:
        return self._quats
    if self._rot_mats is None:
        raise ValueError("Both rotations are None")
    # Convert on the fly from the matrix representation.
    return rot_to_quat(self._rot_mats)
def get_cur_rot(self) -> torch.Tensor:
    """
    Return the underlying rotation in its current form.

    Returns:
        The stored rotation
    """
    # Whichever representation is populated is returned as-is;
    # matrices take precedence.
    for rep in (self._rot_mats, self._quats):
        if rep is not None:
            return rep
    raise ValueError("Both rotations are None")
# Rotation functions
def compose_q_update_vec(self,
    q_update_vec: torch.Tensor,
    normalize_quats: bool = True
) -> Rotation:
    """
    Returns a new quaternion Rotation after updating the current
    object's underlying rotation with a quaternion update, formatted
    as a [*, 3] tensor whose final three columns represent x, y, z such
    that (1, x, y, z) is the desired (not necessarily unit) quaternion
    update.

    Args:
        q_update_vec:
            A [*, 3] quaternion update tensor
        normalize_quats:
            Whether to normalize the output quaternion
    Returns:
        An updated Rotation
    """
    current = self.get_quats()
    updated = current + quat_multiply_by_vec(current, q_update_vec)
    return Rotation(
        rot_mats=None,
        quats=updated,
        normalize_quats=normalize_quats,
    )
def compose_r(self, r: Rotation) -> Rotation:
    """
    Compose the rotation matrices of the current Rotation object with
    those of another.

    Args:
        r:
            An update rotation object
    Returns:
        An updated rotation object
    """
    composed = rot_matmul(self.get_rot_mats(), r.get_rot_mats())
    return Rotation(rot_mats=composed, quats=None)
def compose_q(self, r: Rotation, normalize_quats: bool = True) -> Rotation:
    """
    Compose the quaternions of the current Rotation object with those
    of another.

    Depending on whether either Rotation was initialized with
    quaternions, this function may call torch.linalg.eigh.

    Args:
        r:
            An update rotation object
        normalize_quats:
            Whether to normalize the composed quaternion
    Returns:
        An updated rotation object
    """
    composed = quat_multiply(self.get_quats(), r.get_quats())
    return Rotation(
        rot_mats=None,
        quats=composed,
        normalize_quats=normalize_quats,
    )
def apply(self, pts: torch.Tensor) -> torch.Tensor:
    """
    Apply the current Rotation as a rotation matrix to a set of 3D
    coordinates.

    Args:
        pts:
            A [*, 3] set of points
    Returns:
        [*, 3] rotated points
    """
    return rot_vec_mul(self.get_rot_mats(), pts)
def invert_apply(self, pts: torch.Tensor) -> torch.Tensor:
    """
    The inverse of the apply() method.

    Args:
        pts:
            A [*, 3] set of points
    Returns:
        [*, 3] inverse-rotated points
    """
    inverse = invert_rot_mat(self.get_rot_mats())
    return rot_vec_mul(inverse, pts)
def invert(self) -> Rotation:
    """
    Returns the inverse of the current Rotation.

    Returns:
        The inverse of the current Rotation
    """
    if self._rot_mats is not None:
        return Rotation(
            rot_mats=invert_rot_mat(self._rot_mats),
            quats=None,
        )
    if self._quats is not None:
        # invert_quat preserves the quaternion's scale, so no
        # re-normalization is performed here.
        return Rotation(
            rot_mats=None,
            quats=invert_quat(self._quats),
            normalize_quats=False,
        )
    raise ValueError("Both rotations are None")
# "Tensor" stuff
def unsqueeze(self,
    dim: int,
) -> Rotation:
    """
    Analogous to torch.unsqueeze. The dimension is relative to the
    shape of the Rotation object.

    Args:
        dim: A positive or negative dimension index.
    Returns:
        The unsqueezed Rotation.
    Raises:
        ValueError: If dim is out of range or both representations
            are None.
    """
    if dim >= len(self.shape):
        raise ValueError("Invalid dimension")
    if(self._rot_mats is not None):
        # Negative indices must be shifted past the trailing (3, 3)
        # matrix dimensions.
        rot_mats = self._rot_mats.unsqueeze(dim if dim >= 0 else dim - 2)
        return Rotation(rot_mats=rot_mats, quats=None)
    elif(self._quats is not None):
        # Negative indices must be shifted past the trailing quaternion
        # dimension.
        quats = self._quats.unsqueeze(dim if dim >= 0 else dim - 1)
        return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
    else:
        raise ValueError("Both rotations are None")
@staticmethod
def cat(
    rs: Sequence[Rotation],
    dim: int,
) -> Rotation:
    """
    Concatenates rotations along one of the batch dimensions. Analogous
    to torch.cat().

    Note that the output of this operation is always a rotation matrix,
    regardless of the format of input rotations.

    Args:
        rs:
            A list of rotation objects
        dim:
            The dimension along which the rotations should be
            concatenated
    Returns:
        A concatenated Rotation object in rotation matrix format
    """
    rot_mats = [r.get_rot_mats() for r in rs]
    # Negative indices must be shifted past the trailing (3, 3) matrix
    # dimensions.
    rot_mats = torch.cat(rot_mats, dim=dim if dim >= 0 else dim - 2)
    return Rotation(rot_mats=rot_mats, quats=None)
def map_tensor_fn(self,
    fn: Callable[[torch.Tensor], torch.Tensor]
) -> Rotation:
    """
    Apply a Tensor -> Tensor function to underlying rotation tensors,
    mapping over the rotation dimension(s). Can be used e.g. to sum out
    a one-hot batch dimension.

    Args:
        fn:
            A Tensor -> Tensor function to be mapped over the Rotation
    Returns:
        The transformed Rotation object
    """
    if(self._rot_mats is not None):
        # Flatten the (3, 3) matrix dims so fn only sees batch entries.
        rot_mats = self._rot_mats.view(self._rot_mats.shape[:-2] + (9,))
        rot_mats = torch.stack(
            list(map(fn, torch.unbind(rot_mats, dim=-1))), dim=-1
        )
        rot_mats = rot_mats.view(rot_mats.shape[:-1] + (3, 3))
        return Rotation(rot_mats=rot_mats, quats=None)
    elif(self._quats is not None):
        quats = torch.stack(
            list(map(fn, torch.unbind(self._quats, dim=-1))), dim=-1
        )
        return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
    else:
        raise ValueError("Both rotations are None")
def cuda(self) -> Rotation:
    """
    Analogous to the cuda() method of torch Tensors.

    Returns:
        A copy of the Rotation in CUDA memory
    """
    if self._rot_mats is not None:
        return Rotation(rot_mats=self._rot_mats.cuda(), quats=None)
    if self._quats is not None:
        # Moving devices does not change the values, so skip
        # re-normalization.
        return Rotation(
            rot_mats=None,
            quats=self._quats.cuda(),
            normalize_quats=False,
        )
    raise ValueError("Both rotations are None")
def to(self,
    device: Optional[torch.device],
    dtype: Optional[torch.dtype]
) -> Rotation:
    """
    Analogous to the to() method of torch Tensors.

    Args:
        device:
            A torch device
        dtype:
            A torch dtype
    Returns:
        A copy of the Rotation using the new device and dtype
    """
    if self._rot_mats is not None:
        moved = self._rot_mats.to(device=device, dtype=dtype)
        return Rotation(rot_mats=moved, quats=None)
    if self._quats is not None:
        moved = self._quats.to(device=device, dtype=dtype)
        # Casting/moving does not change normalization, so skip it.
        return Rotation(rot_mats=None, quats=moved, normalize_quats=False)
    raise ValueError("Both rotations are None")
def detach(self) -> Rotation:
    """
    Returns a copy of the Rotation whose underlying Tensor has been
    detached from its torch graph.

    Returns:
        A copy of the Rotation whose underlying Tensor has been detached
        from its torch graph
    """
    if self._rot_mats is not None:
        return Rotation(rot_mats=self._rot_mats.detach(), quats=None)
    if self._quats is not None:
        # Detaching does not change the values, so skip re-normalization.
        return Rotation(
            rot_mats=None,
            quats=self._quats.detach(),
            normalize_quats=False,
        )
    raise ValueError("Both rotations are None")
class Rigid:
    """
    A class representing a rigid transformation. Little more than a wrapper
    around two objects: a Rotation object and a [*, 3] translation
    Designed to behave approximately like a single torch tensor with the
    shape of the shared batch dimensions of its component parts.
    """

    def __init__(self,
        rots: Optional[Rotation],
        trans: Optional[torch.Tensor],
    ):
        """
        Args:
            rots: A [*, 3, 3] rotation tensor
            trans: A corresponding [*, 3] translation tensor
        """
        # (we need device, dtype, etc. from at least one input)
        batch_dims, dtype, device, requires_grad = None, None, None, None
        if(trans is not None):
            batch_dims = trans.shape[:-1]
            dtype = trans.dtype
            device = trans.device
            requires_grad = trans.requires_grad
        elif(rots is not None):
            batch_dims = rots.shape
            dtype = rots.dtype
            device = rots.device
            requires_grad = rots.requires_grad
        else:
            raise ValueError("At least one input argument must be specified")

        # A missing component is replaced by an identity of matching
        # batch shape.
        if(rots is None):
            rots = Rotation.identity(
                batch_dims, dtype, device, requires_grad,
            )
        elif(trans is None):
            trans = identity_trans(
                batch_dims, dtype, device, requires_grad,
            )

        if((rots.shape != trans.shape[:-1]) or
           (rots.device != trans.device)):
            raise ValueError("Rots and trans incompatible")

        # Force full precision. Happens to the rotations automatically.
        trans = trans.to(dtype=torch.float32)

        # _rots: Rotation object; _trans: [*, 3] float32 tensor.
        self._rots = rots
        self._trans = trans

    @staticmethod
    def identity(
        shape: Tuple[int, ...],
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
        requires_grad: bool = True,
        fmt: str = "quat",
    ) -> Rigid:
        """
        Constructs an identity transformation.

        Args:
            shape:
                The desired shape
            dtype:
                The dtype of both internal tensors
            device:
                The device of both internal tensors
            requires_grad:
                Whether grad should be enabled for the internal tensors
            fmt:
                Internal rotation format, "quat" or "rot_mat"
        Returns:
            The identity transformation
        """
        return Rigid(
            Rotation.identity(shape, dtype, device, requires_grad, fmt=fmt),
            identity_trans(shape, dtype, device, requires_grad),
        )

    def __getitem__(self,
        index: Any,
    ) -> Rigid:
        """
        Indexes the affine transformation with PyTorch-style indices.
        The index is applied to the shared dimensions of both the rotation
        and the translation.

        E.g.::
            r = Rotation(rot_mats=torch.rand(10, 10, 3, 3), quats=None)
            t = Rigid(r, torch.rand(10, 10, 3))
            indexed = t[3, 4:6]
            assert(indexed.shape == (2,))
            assert(indexed.get_rots().shape == (2,))
            assert(indexed.get_trans().shape == (2, 3))

        Args:
            index: A standard torch tensor index. E.g. 8, (10, None, 3),
                or (3, slice(0, 1, None))
        Returns:
            The indexed tensor
        """
        if type(index) != tuple:
            index = (index,)

        # The translation carries a trailing xyz dim that the index must
        # not touch, hence the appended full slice.
        return Rigid(
            self._rots[index],
            self._trans[index + (slice(None),)],
        )

    def __mul__(self,
        right: torch.Tensor,
    ) -> Rigid:
        """
        Pointwise left multiplication of the transformation with a tensor.
        Can be used to e.g. mask the Rigid.

        Args:
            right:
                The tensor multiplicand
        Returns:
            The product
        """
        if not(isinstance(right, torch.Tensor)):
            raise TypeError("The other multiplicand must be a Tensor")

        new_rots = self._rots * right
        # Broadcast the multiplier over the trailing xyz dimension.
        new_trans = self._trans * right[..., None]

        return Rigid(new_rots, new_trans)

    def __rmul__(self,
        left: torch.Tensor,
    ) -> Rigid:
        """
        Reverse pointwise multiplication of the transformation with a
        tensor.

        Args:
            left:
                The left multiplicand
        Returns:
            The product
        """
        return self.__mul__(left)

    @property
    def shape(self) -> torch.Size:
        """
        Returns the shape of the shared dimensions of the rotation and
        the translation.

        Returns:
            The shape of the transformation
        """
        s = self._trans.shape[:-1]
        return s

    @property
    def device(self) -> torch.device:
        """
        Returns the device on which the Rigid's tensors are located.

        Returns:
            The device on which the Rigid's tensors are located
        """
        return self._trans.device

    def get_rots(self) -> Rotation:
        """
        Getter for the rotation.

        Returns:
            The rotation object
        """
        return self._rots

    def get_trans(self) -> torch.Tensor:
        """
        Getter for the translation.

        Returns:
            The stored translation
        """
        return self._trans

    def compose_q_update_vec(self,
        q_update_vec: torch.Tensor,
    ) -> Rigid:
        """
        Composes the transformation with a quaternion update vector of
        shape [*, 6], where the first three columns represent the x, y,
        and z values of a quaternion of form (1, x, y, z) and the final
        three a 3D translation.

        Args:
            q_update_vec: The quaternion update vector.
        Returns:
            The composed transformation.
        """
        q_vec, t_vec = q_update_vec[..., :3], q_update_vec[..., 3:]
        new_rots = self._rots.compose_q_update_vec(q_vec)

        # The translation update is rotated into the current frame before
        # being added.
        trans_update = self._rots.apply(t_vec)
        new_translation = self._trans + trans_update

        return Rigid(new_rots, new_translation)

    def compose(self,
        r: Rigid,
    ) -> Rigid:
        """
        Composes the current rigid object with another.

        Args:
            r:
                Another Rigid object
        Returns:
            The composition of the two transformations
        """
        new_rot = self._rots.compose_r(r._rots)
        new_trans = self._rots.apply(r._trans) + self._trans
        return Rigid(new_rot, new_trans)

    def apply(self,
        pts: torch.Tensor,
    ) -> torch.Tensor:
        """
        Applies the transformation to a coordinate tensor.

        Args:
            pts: A [*, 3] coordinate tensor.
        Returns:
            The transformed points.
        """
        rotated = self._rots.apply(pts)
        return rotated + self._trans

    def invert_apply(self,
        pts: torch.Tensor
    ) -> torch.Tensor:
        """
        Applies the inverse of the transformation to a coordinate tensor.

        Args:
            pts: A [*, 3] coordinate tensor
        Returns:
            The transformed points.
        """
        # Undo the translation first, then the rotation.
        pts = pts - self._trans
        return self._rots.invert_apply(pts)

    def invert(self) -> Rigid:
        """
        Inverts the transformation.

        Returns:
            The inverse transformation.
        """
        rot_inv = self._rots.invert()
        # Inverse translation is expressed in the inverted frame.
        trn_inv = rot_inv.apply(self._trans)

        return Rigid(rot_inv, -1 * trn_inv)

    def map_tensor_fn(self,
        fn: Callable[[torch.Tensor], torch.Tensor]
    ) -> Rigid:
        """
        Apply a Tensor -> Tensor function to underlying translation and
        rotation tensors, mapping over the translation/rotation dimensions
        respectively.

        Args:
            fn:
                A Tensor -> Tensor function to be mapped over the Rigid
        Returns:
            The transformed Rigid object
        """
        new_rots = self._rots.map_tensor_fn(fn)
        new_trans = torch.stack(
            list(map(fn, torch.unbind(self._trans, dim=-1))),
            dim=-1
        )

        return Rigid(new_rots, new_trans)

    def to_tensor_4x4(self) -> torch.Tensor:
        """
        Converts a transformation to a homogenous transformation tensor.

        Returns:
            A [*, 4, 4] homogenous transformation tensor
        """
        tensor = self._trans.new_zeros((*self.shape, 4, 4))
        tensor[..., :3, :3] = self._rots.get_rot_mats()
        tensor[..., :3, 3] = self._trans
        tensor[..., 3, 3] = 1
        return tensor

    @staticmethod
    def from_tensor_4x4(
        t: torch.Tensor
    ) -> Rigid:
        """
        Constructs a transformation from a homogenous transformation
        tensor.

        Args:
            t: [*, 4, 4] homogenous transformation tensor
        Returns:
            T object with shape [*]
        """
        if(t.shape[-2:] != (4, 4)):
            raise ValueError("Incorrectly shaped input tensor")

        rots = Rotation(rot_mats=t[..., :3, :3], quats=None)
        trans = t[..., :3, 3]

        return Rigid(rots, trans)

    def to_tensor_7(self) -> torch.Tensor:
        """
        Converts a transformation to a tensor with 7 final columns, four
        for the quaternion followed by three for the translation.

        Returns:
            A [*, 7] tensor representation of the transformation
        """
        tensor = self._trans.new_zeros((*self.shape, 7))
        tensor[..., :4] = self._rots.get_quats()
        tensor[..., 4:] = self._trans

        return tensor

    @staticmethod
    def from_tensor_7(
        t: torch.Tensor,
        normalize_quats: bool = False,
    ) -> Rigid:
        """
        Constructs a transformation from a [*, 7] tensor whose first four
        columns are a quaternion and whose last three are a translation.

        Args:
            t: [*, 7] transformation tensor
            normalize_quats: Whether to normalize the quaternion part
        Returns:
            A transformation object of shape [*]
        """
        if(t.shape[-1] != 7):
            raise ValueError("Incorrectly shaped input tensor")

        quats, trans = t[..., :4], t[..., 4:]

        rots = Rotation(
            rot_mats=None,
            quats=quats,
            normalize_quats=normalize_quats
        )

        return Rigid(rots, trans)

    @staticmethod
    def from_3_points(
        p_neg_x_axis: torch.Tensor,
        origin: torch.Tensor,
        p_xy_plane: torch.Tensor,
        eps: float = 1e-8
    ) -> Rigid:
        """
        Implements algorithm 21. Constructs transformations from sets of 3
        points using the Gram-Schmidt algorithm.

        Args:
            p_neg_x_axis: [*, 3] coordinates
            origin: [*, 3] coordinates used as frame origins
            p_xy_plane: [*, 3] coordinates
            eps: Small epsilon value
        Returns:
            A transformation object of shape [*]
        """
        p_neg_x_axis = torch.unbind(p_neg_x_axis, dim=-1)
        origin = torch.unbind(origin, dim=-1)
        p_xy_plane = torch.unbind(p_xy_plane, dim=-1)

        e0 = [c1 - c2 for c1, c2 in zip(origin, p_neg_x_axis)]
        e1 = [c1 - c2 for c1, c2 in zip(p_xy_plane, origin)]

        # Gram-Schmidt: normalize e0, then orthogonalize e1 against it
        # and normalize.
        denom = torch.sqrt(sum((c * c for c in e0)) + eps)
        e0 = [c / denom for c in e0]
        dot = sum((c1 * c2 for c1, c2 in zip(e0, e1)))
        e1 = [c2 - c1 * dot for c1, c2 in zip(e0, e1)]
        denom = torch.sqrt(sum((c * c for c in e1)) + eps)
        e1 = [c / denom for c in e1]
        # e2 = e0 x e1 completes the right-handed basis.
        e2 = [
            e0[1] * e1[2] - e0[2] * e1[1],
            e0[2] * e1[0] - e0[0] * e1[2],
            e0[0] * e1[1] - e0[1] * e1[0],
        ]

        rots = torch.stack([c for tup in zip(e0, e1, e2) for c in tup], dim=-1)
        rots = rots.reshape(rots.shape[:-1] + (3, 3))

        rot_obj = Rotation(rot_mats=rots, quats=None)

        return Rigid(rot_obj, torch.stack(origin, dim=-1))

    def unsqueeze(self,
        dim: int,
    ) -> Rigid:
        """
        Analogous to torch.unsqueeze. The dimension is relative to the
        shared dimensions of the rotation/translation.

        Args:
            dim: A positive or negative dimension index.
        Returns:
            The unsqueezed transformation.
        """
        if dim >= len(self.shape):
            raise ValueError("Invalid dimension")
        rots = self._rots.unsqueeze(dim)
        # Negative indices must skip the trailing xyz dimension.
        trans = self._trans.unsqueeze(dim if dim >= 0 else dim - 1)

        return Rigid(rots, trans)

    @staticmethod
    def cat(
        ts: Sequence[Rigid],
        dim: int,
    ) -> Rigid:
        """
        Concatenates transformations along one of the batch dimensions.

        Args:
            ts:
                A list of T objects
            dim:
                The dimension along which the transformations should be
                concatenated
        Returns:
            A concatenated transformation object
        """
        rots = Rotation.cat([t._rots for t in ts], dim)
        # Negative indices must skip the trailing xyz dimension.
        trans = torch.cat(
            [t._trans for t in ts], dim=dim if dim >= 0 else dim - 1
        )

        return Rigid(rots, trans)

    def apply_rot_fn(self, fn: Callable[[Rotation], Rotation]) -> Rigid:
        """
        Applies a Rotation -> Rotation function to the stored rotation
        object.

        Args:
            fn: A function of type Rotation -> Rotation
        Returns:
            A transformation object with a transformed rotation.
        """
        return Rigid(fn(self._rots), self._trans)

    def apply_trans_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rigid:
        """
        Applies a Tensor -> Tensor function to the stored translation.

        Args:
            fn:
                A function of type Tensor -> Tensor to be applied to the
                translation
        Returns:
            A transformation object with a transformed translation.
        """
        return Rigid(self._rots, fn(self._trans))

    def scale_translation(self, trans_scale_factor: float) -> Rigid:
        """
        Scales the translation by a constant factor.

        Args:
            trans_scale_factor:
                The constant factor
        Returns:
            A transformation object with a scaled translation.
        """
        fn = lambda t: t * trans_scale_factor
        return self.apply_trans_fn(fn)

    def stop_rot_gradient(self) -> Rigid:
        """
        Detaches the underlying rotation object.

        Returns:
            A transformation object with detached rotations
        """
        fn = lambda r: r.detach()
        return self.apply_rot_fn(fn)

    @staticmethod
    def make_transform_from_reference(n_xyz, ca_xyz, c_xyz, eps=1e-20):
        """
        Returns a transformation object from reference coordinates.

        Note that this method does not take care of symmetries. If you
        provide the atom positions in the non-standard way, the N atom will
        end up not at [-0.527250, 1.359329, 0.0] but instead at
        [-0.527250, -1.359329, 0.0]. You need to take care of such cases in
        your code.

        Args:
            n_xyz: A [*, 3] tensor of nitrogen xyz coordinates.
            ca_xyz: A [*, 3] tensor of carbon alpha xyz coordinates.
            c_xyz: A [*, 3] tensor of carbon xyz coordinates.
        Returns:
            A transformation object. After applying the translation and
            rotation to the reference backbone, the coordinates will
            approximately equal to the input coordinates.
        """
        # Center the frame on the C-alpha atom.
        translation = -1 * ca_xyz
        n_xyz = n_xyz + translation
        c_xyz = c_xyz + translation

        # First rotation: bring C onto the xz-plane (rotate about z).
        c_x, c_y, c_z = [c_xyz[..., i] for i in range(3)]
        norm = torch.sqrt(eps + c_x ** 2 + c_y ** 2)
        sin_c1 = -c_y / norm
        cos_c1 = c_x / norm
        # NOTE(review): zeros/ones below are unused; kept for fidelity.
        zeros = sin_c1.new_zeros(sin_c1.shape)
        ones = sin_c1.new_ones(sin_c1.shape)

        c1_rots = sin_c1.new_zeros((*sin_c1.shape, 3, 3))
        c1_rots[..., 0, 0] = cos_c1
        c1_rots[..., 0, 1] = -1 * sin_c1
        c1_rots[..., 1, 0] = sin_c1
        c1_rots[..., 1, 1] = cos_c1
        c1_rots[..., 2, 2] = 1

        # Second rotation: bring C onto the x-axis (rotate about y).
        norm = torch.sqrt(eps + c_x ** 2 + c_y ** 2 + c_z ** 2)
        sin_c2 = c_z / norm
        cos_c2 = torch.sqrt(c_x ** 2 + c_y ** 2) / norm

        c2_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3))
        c2_rots[..., 0, 0] = cos_c2
        c2_rots[..., 0, 2] = sin_c2
        c2_rots[..., 1, 1] = 1
        c2_rots[..., 2, 0] = -1 * sin_c2
        c2_rots[..., 2, 2] = cos_c2

        c_rots = rot_matmul(c2_rots, c1_rots)
        n_xyz = rot_vec_mul(c_rots, n_xyz)

        # Third rotation: bring N into the xy-plane (rotate about x).
        _, n_y, n_z = [n_xyz[..., i] for i in range(3)]
        norm = torch.sqrt(eps + n_y ** 2 + n_z ** 2)
        sin_n = -n_z / norm
        cos_n = n_y / norm

        n_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3))
        n_rots[..., 0, 0] = 1
        n_rots[..., 1, 1] = cos_n
        n_rots[..., 1, 2] = -1 * sin_n
        n_rots[..., 2, 1] = sin_n
        n_rots[..., 2, 2] = cos_n

        rots = rot_matmul(n_rots, c_rots)

        # Invert: the returned transform maps the reference frame to the
        # input coordinates.
        rots = rots.transpose(-1, -2)
        translation = -1 * translation

        rot_obj = Rotation(rot_mats=rots, quats=None)

        return Rigid(rot_obj, translation)

    def cuda(self) -> Rigid:
        """
        Moves the transformation object to GPU memory.

        Returns:
            A version of the transformation on GPU
        """
        return Rigid(self._rots.cuda(), self._trans.cuda())
import json
import logging
import os
import re
import time
import numpy
import torch
from openfold.model.model import AlphaFold
from openfold.np import residue_constants, protein
from openfold.np.relax import relax
from openfold.utils.import_weights import (
import_jax_weights_,
)
from pytorch_lightning.utilities.deepspeed import (
convert_zero_checkpoint_to_fp32_state_dict
)
# Module-level logger setup. __file__ is used as the logger name so
# messages can be traced back to this script.
logging.basicConfig()
logger = logging.getLogger(__file__)
logger.setLevel(level=logging.INFO)
def count_models_to_evaluate(openfold_checkpoint_path, jax_param_path):
    """Return the total number of models named in the two comma-separated
    path arguments (either argument may be None or empty)."""
    paths = (openfold_checkpoint_path, jax_param_path)
    return sum(len(p.split(",")) for p in paths if p)
def get_model_basename(model_path):
    """Return the file name of *model_path* without its extension."""
    normalized = os.path.normpath(model_path)
    stem, _ = os.path.splitext(os.path.basename(normalized))
    return stem
def make_output_directory(output_dir, model_name, multiple_model_mode):
    """Create (if necessary) and return the prediction output directory.

    In multiple-model mode each model gets its own subdirectory so that
    outputs from different models don't collide.
    """
    components = [output_dir, "predictions"]
    if multiple_model_mode:
        components.append(model_name)
    prediction_dir = os.path.join(*components)
    os.makedirs(prediction_dir, exist_ok=True)
    return prediction_dir
def load_models_from_command_line(config, model_device, openfold_checkpoint_path, jax_param_path, output_dir):
    """
    Generator yielding one (model, output_directory) pair per model named
    in the comma-separated path arguments. JAX parameter sets are loaded
    first, then OpenFold checkpoints.

    Raises:
        ValueError: If neither path argument is provided.
    """
    # Create the output directory
    multiple_model_mode = count_models_to_evaluate(openfold_checkpoint_path, jax_param_path) > 1
    if multiple_model_mode:
        logger.info(f"evaluating multiple models")

    if jax_param_path:
        for path in jax_param_path.split(","):
            model_basename = get_model_basename(path)
            # e.g. "params_model_1_ptm" -> version "model_1_ptm"
            model_version = "_".join(model_basename.split("_")[1:])
            model = AlphaFold(config)
            model = model.eval()
            import_jax_weights_(
                model, path, version=model_version
            )
            model = model.to(model_device)
            logger.info(
                f"Successfully loaded JAX parameters at {path}..."
            )
            output_directory = make_output_directory(output_dir, model_basename, multiple_model_mode)
            yield model, output_directory

    if openfold_checkpoint_path:
        for path in openfold_checkpoint_path.split(","):
            model = AlphaFold(config)
            model = model.eval()
            checkpoint_basename = get_model_basename(path)
            if os.path.isdir(path):
                # A DeepSpeed checkpoint
                ckpt_path = os.path.join(
                    output_dir,
                    checkpoint_basename + ".pt",
                )

                # Convert the ZeRO shards to a single fp32 state dict,
                # caching the result next to the outputs.
                if not os.path.isfile(ckpt_path):
                    convert_zero_checkpoint_to_fp32_state_dict(
                        path,
                        ckpt_path,
                    )
                d = torch.load(ckpt_path)
                model.load_state_dict(d["ema"]["params"])
            else:
                ckpt_path = path
                d = torch.load(ckpt_path)

                if "ema" in d:
                    # The public weights have had this done to them already
                    d = d["ema"]["params"]
                model.load_state_dict(d)

            model = model.to(model_device)
            logger.info(
                f"Loaded OpenFold parameters at {path}..."
            )
            output_directory = make_output_directory(output_dir, checkpoint_basename, multiple_model_mode)
            yield model, output_directory

    if not jax_param_path and not openfold_checkpoint_path:
        raise ValueError(
            "At least one of jax_param_path or openfold_checkpoint_path must "
            "be specified."
        )
def parse_fasta(data):
    """Parse a FASTA string into ([tag, ...], [sequence, ...]).

    Tags are truncated at the first whitespace; multi-line sequences are
    joined into a single string.
    """
    # Drop trailing '>' markers that would otherwise create empty records.
    data = re.sub('>$', '', data, flags=re.M)
    records = []
    for prot in data.split('>'):
        # Split each record into its header line and the remainder.
        for piece in prot.strip().split('\n', 1):
            records.append(piece.replace('\n', ''))
    records = records[1:]
    tags = [t.split()[0] for t in records[::2]]
    seqs = records[1::2]
    return tags, seqs
def update_timings(timing_dict, output_file=os.path.join(os.getcwd(), "timings.json")):
    """
    Write dictionary of one or more run step times to a file
    """
    timings = {}
    if os.path.exists(output_file):
        # Merge with any timings previously recorded in the file.
        with open(output_file, "r") as f:
            try:
                timings = json.load(f)
            except json.JSONDecodeError:
                logger.info(f"Overwriting non-standard JSON in {output_file}.")
                timings = {}
    timings.update(timing_dict)
    with open(output_file, "w") as f:
        json.dump(timings, f)
    return output_file
def run_model(model, batch, tag, output_dir):
    """
    Runs a single no-grad forward pass of *model* on *batch*, logging and
    recording the inference time, and returns the model output.
    """
    with torch.no_grad():
        # Temporarily disable templates if there aren't any in the batch
        template_enabled = model.config.template.enabled
        model.config.template.enabled = template_enabled and any([
            "template_" in k for k in batch
        ])

        logger.info(f"Running inference for {tag}...")
        t = time.perf_counter()
        out = model(batch)
        inference_time = time.perf_counter() - t
        logger.info(f"Inference time: {inference_time}")
        update_timings({"inference": inference_time}, os.path.join(output_dir, "timings.json"))

        # Restore the caller's template setting for subsequent runs.
        model.config.template.enabled = template_enabled

    return out
def prep_output(out, batch, feature_dict, feature_processor, config_preset, multimer_ri_gap, subtract_plddt):
    """
    Assembles an unrelaxed Protein object from model output, storing
    pLDDT in the B-factor column and undoing the artificial residue-index
    gaps inserted between chains of multi-chain FASTAs.
    """
    plddt = out["plddt"]

    # Broadcast per-residue pLDDT to every atom so it can live in the
    # per-atom B-factor column.
    plddt_b_factors = numpy.repeat(
        plddt[..., None], residue_constants.atom_type_num, axis=-1
    )

    if subtract_plddt:
        plddt_b_factors = 100 - plddt_b_factors

    # Prep protein metadata
    template_domain_names = []
    template_chain_index = None
    if feature_processor.config.common.use_templates and "template_domain_names" in feature_dict:
        template_domain_names = [
            t.decode("utf-8") for t in feature_dict["template_domain_names"]
        ]

        # This works because templates are not shuffled during inference
        template_domain_names = template_domain_names[
            :feature_processor.config.predict.max_templates
        ]

        if "template_chain_index" in feature_dict:
            template_chain_index = feature_dict["template_chain_index"]
            template_chain_index = template_chain_index[
                :feature_processor.config.predict.max_templates
            ]

    no_recycling = feature_processor.config.common.max_recycling_iters
    remark = ', '.join([
        f"no_recycling={no_recycling}",
        f"max_templates={feature_processor.config.predict.max_templates}",
        f"config_preset={config_preset}",
    ])

    # For multi-chain FASTAs
    ri = feature_dict["residue_index"]
    # Chains were separated by multimer_ri_gap in the residue index;
    # recover each residue's chain id from that offset.
    chain_index = (ri - numpy.arange(ri.shape[0])) / multimer_ri_gap
    chain_index = chain_index.astype(numpy.int64)
    cur_chain = 0
    prev_chain_max = 0
    for i, c in enumerate(chain_index):
        if c != cur_chain:
            cur_chain = c
            prev_chain_max = i + cur_chain * multimer_ri_gap

        # Remove the artificial gap so residue numbering restarts per chain.
        batch["residue_index"][i] -= prev_chain_max

    unrelaxed_protein = protein.from_prediction(
        features=batch,
        result=out,
        b_factors=plddt_b_factors,
        chain_index=chain_index,
        remark=remark,
        parents=template_domain_names,
        parents_chain_index=template_chain_index,
    )

    return unrelaxed_protein
def relax_protein(config, model_device, unrelaxed_protein, output_directory, output_name, cif_output):
    """
    Runs Amber relaxation on *unrelaxed_protein*, records the relaxation
    time, and writes the relaxed structure (PDB or, if cif_output is set,
    ModelCIF) into *output_directory*.
    """
    amber_relaxer = relax.AmberRelaxation(
        use_gpu=(model_device != "cpu"),
        **config.relax,
    )

    t = time.perf_counter()
    visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", default="")
    if "cuda" in model_device:
        # Pin relaxation to the requested GPU; restored below.
        device_no = model_device.split(":")[-1]
        os.environ["CUDA_VISIBLE_DEVICES"] = device_no
    # the struct_str will contain either a PDB-format or a ModelCIF format string
    struct_str, _, _ = amber_relaxer.process(prot=unrelaxed_protein, cif_output=cif_output)
    os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices
    relaxation_time = time.perf_counter() - t

    logger.info(f"Relaxation time: {relaxation_time}")
    update_timings({"relaxation": relaxation_time}, os.path.join(output_directory, "timings.json"))

    # Save the relaxed PDB.
    suffix = "_relaxed.pdb"
    if cif_output:
        suffix = "_relaxed.cif"
    relaxed_output_path = os.path.join(
        output_directory, f'{output_name}{suffix}'
    )

    with open(relaxed_output_path, 'w') as fp:
        fp.write(struct_str)

    logger.info(f"Relaxed output written to {relaxed_output_path}...")
\ No newline at end of file
import os
import logging
import random
import numpy as np
from pytorch_lightning.utilities.seed import seed_everything
from openfold.utils.suppress_output import SuppressLogging
def seed_globally(seed=None):
    """Seed all RNGs via pytorch_lightning, at most once per process.

    The seed is published through the PL_GLOBAL_SEED environment variable;
    if that variable is already set, the *seed* argument is ignored and the
    existing value is reused.
    """
    if "PL_GLOBAL_SEED" not in os.environ:
        if seed is None:
            # Draw a random 32-bit seed when none is supplied.
            seed = random.randint(0, np.iinfo(np.uint32).max)
        os.environ["PL_GLOBAL_SEED"] = str(seed)
        logging.info(f'os.environ["PL_GLOBAL_SEED"] set to {seed}')

    # seed_everything is a bit log-happy
    with SuppressLogging(logging.INFO):
        seed_everything(seed=None)
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from Bio.SVDSuperimposer import SVDSuperimposer
import numpy as np
import torch
def _superimpose_np(reference, coords):
    """
    Superimposes coordinates onto a reference by minimizing RMSD using SVD.

    Args:
        reference:
            [N, 3] reference array
        coords:
            [N, 3] array
    Returns:
        A tuple of [N, 3] superimposed coords and the final RMSD.
    """
    aligner = SVDSuperimposer()
    aligner.set(reference, coords)
    aligner.run()
    return aligner.get_transformed(), aligner.get_rms()
def _superimpose_single(reference, coords):
    """Torch wrapper around _superimpose_np: converts both tensors to
    numpy, superimposes, and returns results as tensors on coords'
    device/dtype."""
    superimposed, rmsd = _superimpose_np(
        reference.detach().cpu().numpy(),
        coords.detach().cpu().numpy(),
    )
    return coords.new_tensor(superimposed), coords.new_tensor(rmsd)
def superimpose(reference, coords, mask):
    """
    Superimposes coordinates onto a reference by minimizing RMSD using SVD.

    Args:
        reference:
            [*, N, 3] reference tensor
        coords:
            [*, N, 3] tensor
        mask:
            [*, N] tensor; positions with mask <= 0 are excluded from the
            fit and zero-filled in the output
    Returns:
        A tuple of [*, N, 3] superimposed coords and [*] final RMSDs.
    """
    def select_unmasked_coords(coords, mask):
        return torch.masked_select(
            coords,
            (mask > 0.)[..., None],
        ).reshape(-1, 3)

    batch_dims = reference.shape[:-2]
    flat_reference = reference.reshape((-1,) + reference.shape[-2:])
    flat_coords = coords.reshape((-1,) + reference.shape[-2:])
    flat_mask = mask.reshape((-1,) + mask.shape[-1:])
    superimposed_list = []
    rmsds = []
    for r, c, m in zip(flat_reference, flat_coords, flat_mask):
        r_unmasked_coords = select_unmasked_coords(r, m)
        c_unmasked_coords = select_unmasked_coords(c, m)
        superimposed, rmsd = _superimpose_single(
            r_unmasked_coords,
            c_unmasked_coords
        )

        # Scatter the superimposed coordinates back to their unmasked
        # positions in one boolean-mask assignment (replaces the former
        # per-residue Python loop); masked positions remain zero. Using
        # "m > 0." keeps the scatter consistent with the selection above.
        superimposed_full_size = torch.zeros_like(r)
        superimposed_full_size[m > 0.] = superimposed

        superimposed_list.append(superimposed_full_size)
        rmsds.append(rmsd)

    superimposed_stacked = torch.stack(superimposed_list, dim=0)
    rmsds_stacked = torch.stack(rmsds, dim=0)

    superimposed_reshaped = superimposed_stacked.reshape(
        batch_dims + coords.shape[-2:]
    )
    rmsds_reshaped = rmsds_stacked.reshape(
        batch_dims
    )

    return superimposed_reshaped, rmsds_reshaped
import logging
import sys
class SuppressStdout:
    """Context manager that redirects stdout to /dev/null while active."""

    def __enter__(self):
        # Remember the real stream so __exit__ can restore it.
        self.stdout = sys.stdout
        sys.stdout = open("/dev/null", "w")

    def __exit__(self, typ, value, traceback):
        sink = sys.stdout
        sys.stdout = self.stdout
        # Close the /dev/null handle to avoid leaking a file descriptor.
        sink.close()
class SuppressLogging:
    """Context manager disabling all logging at or below *level* while
    active."""

    def __init__(self, level):
        # Threshold passed to logging.disable on entry.
        self.level = level

    def __enter__(self):
        logging.disable(self.level)

    def __exit__(self, typ, value, traceback):
        # Re-enable all logging.
        logging.disable(logging.NOTSET)
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
import logging
from typing import Tuple, List, Callable, Any, Dict, Sequence, Optional
import torch
import torch.nn as nn
def add(m1, m2, inplace):
    """Add m2 to m1, in place when *inplace* is truthy.

    The first operation in a checkpoint can't be in-place, but it's nice
    to have in-place addition during inference — hence the switch.
    """
    if inplace:
        m1 += m2
    else:
        m1 = m1 + m2
    return m1
def permute_final_dims(tensor: torch.Tensor, inds: List[int]):
    """Permute the last len(inds) dimensions of *tensor* according to
    *inds* (given relative to that trailing group), leaving the leading
    dimensions untouched."""
    offset = len(tensor.shape) - len(inds)
    order = list(range(offset)) + [offset + i for i in inds]
    return tensor.permute(order)
def flatten_final_dims(t: torch.Tensor, no_dims: int):
    """Collapse the last *no_dims* dimensions of *t* into a single one."""
    leading = t.shape[:-no_dims]
    return t.reshape(leading + (-1,))
def masked_mean(mask, value, dim, eps=1e-4):
    """Mean of *value* over *dim* weighted by *mask*; *eps* guards
    against division by zero when the mask is empty."""
    mask = mask.expand(*value.shape)
    weighted_sum = torch.sum(mask * value, dim=dim)
    denom = eps + torch.sum(mask, dim=dim)
    return weighted_sum / denom
def pts_to_distogram(pts, min_bin=2.3125, max_bin=21.6875, no_bins=64):
    """
    Bins the pairwise Euclidean distances between points in `pts`
    (point coords along the final dim) into `no_bins` distance bins.
    """
    bin_edges = torch.linspace(
        min_bin, max_bin, no_bins - 1, device=pts.device
    )
    deltas = pts.unsqueeze(-2) - pts.unsqueeze(-3)
    pairwise = torch.sqrt((deltas ** 2).sum(dim=-1))
    return torch.bucketize(pairwise, bin_edges)
def dict_multimap(fn, dicts):
    """
    Zips a list of identically-structured (nested) dicts, applying `fn`
    to the list of values gathered at each leaf position.
    """
    template = dicts[0]
    combined = {}
    for key, val in template.items():
        gathered = [d[key] for d in dicts]
        if type(val) is dict:
            combined[key] = dict_multimap(fn, gathered)
        else:
            combined[key] = fn(gathered)
    return combined
def one_hot(x, v_bins):
    """
    Soft-bins each element of `x` by one-hot encoding the index of its
    nearest value in `v_bins`.
    """
    bins = v_bins.view((1,) * x.dim() + (len(v_bins),))
    nearest = torch.argmin((x[..., None] - bins).abs(), dim=-1)
    return nn.functional.one_hot(nearest, num_classes=len(v_bins)).float()
def batched_gather(data, inds, dim=0, no_batch_dims=0):
    """
    Gathers `inds` along dimension `dim` of `data`, treating the first
    `no_batch_dims` dimensions of both as batch dimensions.

    Args:
        data: source tensor
        inds: integer index tensor; its first `no_batch_dims` dims must
            match data's batch dims
        dim: the (possibly negative) dimension of `data` to gather along
        no_batch_dims: number of leading batch dimensions

    Returns:
        Tensor of gathered values with inds' shape in place of the
        gathered dimension.
    """
    ranges = []
    # One broadcastable arange per batch dim, so each batch element
    # gathers with its own indices
    for i, s in enumerate(data.shape[:no_batch_dims]):
        r = torch.arange(s)
        shape = [1] * inds.dim()
        shape[i] = -1
        ranges.append(r.view(*shape))
    remaining_dims = [
        slice(None) for _ in range(len(data.shape) - no_batch_dims)
    ]
    remaining_dims[dim - no_batch_dims if dim >= 0 else dim] = inds
    ranges.extend(remaining_dims)
    # Index with a tuple: indexing with a *list* of tensors/slices is a
    # deprecated advanced-indexing form
    return data[tuple(ranges)]
# With tree_map, a poor man's JAX tree_map
def dict_map(fn, dic, leaf_type):
    """Recursively applies `fn` to every `leaf_type` leaf of a nested dict."""
    new_dict = {}
    for k, v in dic.items():
        if type(v) is dict:
            new_dict[k] = dict_map(fn, v, leaf_type)
        else:
            new_dict[k] = tree_map(fn, v, leaf_type)
    return new_dict


def tree_map(fn, tree, leaf_type):
    """
    Applies `fn` to every `leaf_type` leaf of a tree of nested dicts,
    lists, and tuples, preserving the container structure.

    Raises:
        ValueError: if a node is neither a supported container nor an
            instance of `leaf_type`.
    """
    if isinstance(tree, dict):
        return dict_map(fn, tree, leaf_type)
    elif isinstance(tree, list):
        return [tree_map(fn, x, leaf_type) for x in tree]
    elif isinstance(tree, tuple):
        return tuple([tree_map(fn, x, leaf_type) for x in tree])
    elif isinstance(tree, leaf_type):
        return fn(tree)
    else:
        # Report the offending type in the exception rather than dumping
        # it to stdout with a debug print
        raise ValueError(f"Tree of type {type(tree)} not supported")


# Convenience specialization for trees of torch.Tensors
tensor_tree_map = partial(tree_map, leaf_type=torch.Tensor)
# Copyright 2022 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from functools import partialmethod
import numpy as np
import torch
from openfold.utils.tensor_utils import tensor_tree_map
def pad_feature_dict_seq(feature_dict, seqlen):
    """ Pads the sequence length of a feature dict. Used for tracing. """
    # The real sequence length can't be longer than the desired one
    true_n = feature_dict["aatype"].shape[-2]
    assert true_n <= seqlen

    # Which axis of each feature indexes the sequence
    seq_dim_by_feat = {
        "aatype": -2,
        "between_segment_residues": -1,
        "residue_index": -1,
        "seq_length": -1,
        "deletion_matrix_int": -1,
        "msa": -1,
        "num_alignments": -1,
        "template_aatype": -2,
        "template_all_atom_mask": -2,
        "template_all_atom_positions": -3,
    }

    padded = {}
    for key, feat in feature_dict.items():
        seq_dim = seq_dim_by_feat.get(key)
        if seq_dim is None:
            # No sequence axis; pass the feature through untouched
            padded[key] = feat
            continue
        target_shape = list(feat.shape)
        target_shape[seq_dim] = seqlen
        # Zero-pad, copying the original values into the leading corner
        buf = np.zeros(target_shape, dtype=feat.dtype)
        buf[tuple(slice(0, s) for s in feat.shape)] = feat
        padded[key] = buf

    padded["seq_length"][0] = seqlen

    return padded
def trace_model_(model, sample_input):
    """
    Replaces (in place, hence the trailing underscore) the attention and
    update submodules of `model`'s evoformer blocks with torch.jit-traced,
    frozen equivalents, using `sample_input` to derive representative
    tensor shapes.

    Args:
        model:
            An AlphaFold-style module with `evoformer`, `extra_msa_stack`,
            `template_pair_stack`, `globals`, and `template_config`
            attributes. NOTE(review): structure inferred from the
            attribute accesses below — confirm against the model class.
        sample_input:
            A feature dict of tensors whose final dimension indexes
            recycling iterations.

    Returns:
        None. `model` is modified in place.
    """
    # Grab the inputs to the final recycling iteration
    feats = tensor_tree_map(lambda t: t[..., -1], sample_input)
    # Gather some metadata
    n = feats["aatype"].shape[-1]
    msa_depth = feats["true_msa"].shape[-2]
    extra_msa_depth = feats["extra_msa"].shape[-2]
    no_templates = feats["template_aatype"].shape[-2]
    device = feats["aatype"].device
    seq_mask = feats["seq_mask"].to(device)
    pair_mask = seq_mask[..., None] * seq_mask[..., None, :]
    extra_msa_mask = feats["extra_msa_mask"].to(device)
    template_pair_mask = torch.stack([pair_mask] * no_templates, dim=-3)
    # Create some fake representations with the correct shapes
    # NOTE(review): the "+ 4" presumably accounts for extra rows prepended
    # to the MSA before the evoformer — confirm against the input embedder
    m = torch.rand(msa_depth + 4, n, model.globals.c_m).to(device)
    z = torch.rand(n, n, model.globals.c_z).to(device)
    t = torch.rand(no_templates, n, n, model.globals.c_t).to(device)
    a = torch.rand(extra_msa_depth, n, model.globals.c_e).to(device)
    # NOTE(review): torch.randint(0, 1, ...) is all zeros — presumably only
    # the mask's shape matters for tracing; verify
    msa_mask = torch.randint(0, 1, (msa_depth + 4, n)).to(device)
    # We need to do a dry run through the model so the chunk size tuners'
    # trial runs (which run during the first-ever model iteration) aren't
    # baked into the trace. There's no need to run the entire thing,
    # though; we just need to run one block from each transformer stack.
    evoformer_blocks = model.evoformer.blocks
    model.evoformer.blocks = evoformer_blocks[:1]
    extra_msa_blocks = model.extra_msa_stack.blocks
    model.extra_msa_stack.blocks = extra_msa_blocks[:1]
    if(model.template_config.enabled):
        template_pair_stack_blocks = model.template_pair_stack.blocks
        model.template_pair_stack.blocks = template_pair_stack_blocks[:1]
    single_recycling_iter_input = tensor_tree_map(
        lambda t: t[..., :1], sample_input,
    )
    with torch.no_grad():
        _ = model(single_recycling_iter_input)
    # Restore the stacks that were truncated for the dry run
    model.evoformer.blocks = evoformer_blocks
    model.extra_msa_stack.blocks = extra_msa_blocks
    del evoformer_blocks, extra_msa_blocks
    if(model.template_config.enabled):
        model.template_pair_stack.blocks = template_pair_stack_blocks
        del template_pair_stack_blocks
    def get_tuned_chunk_size(module):
        # Reads the chunk size cached by the module's tuner during the
        # dry run above
        tuner = module.chunk_size_tuner
        chunk_size = tuner.cached_chunk_size
        # After our trial run above, this should always be set
        assert(chunk_size is not None)
        return chunk_size
    # Fetch the resulting chunk sizes
    evoformer_chunk_size = model.globals.chunk_size
    if(model.evoformer.chunk_size_tuner is not None):
        evoformer_chunk_size = get_tuned_chunk_size(model.evoformer)
    extra_msa_chunk_size = model.globals.chunk_size
    if(model.extra_msa_stack.chunk_size_tuner is not None):
        extra_msa_chunk_size = get_tuned_chunk_size(model.extra_msa_stack)
    if(model.template_config.enabled):
        template_pair_stack_chunk_size = model.globals.chunk_size
        if(model.template_pair_stack.chunk_size_tuner is not None):
            template_pair_stack_chunk_size = get_tuned_chunk_size(
                model.template_pair_stack
            )
    def trace_block(block, block_inputs):
        # Traces and freezes one submodule, returning a wrapper that
        # coerces all inputs to tensors (a torch.jit.trace requirement)
        # Yes, yes, I know
        with contextlib.redirect_stderr(None):
            traced_block = torch.jit.trace(block, block_inputs)
            traced_block = torch.jit.freeze(traced_block, optimize_numerics=True)
        # It would be nice to use this, but its runtimes are extremely
        # unpredictable
        # traced_block = torch.jit.optimize_for_inference(traced_block)
        # All trace inputs need to be tensors. This wrapper takes care of that
        def traced_block_wrapper(*args, **kwargs):
            to_tensor = lambda t: torch.tensor(t) if type(t) != torch.Tensor else t
            args = [to_tensor(a) for a in args]
            kwargs = {k: to_tensor(v) for k,v in kwargs.items()}
            return traced_block(*args, **kwargs)
        return traced_block_wrapper
    def verify_arg_order(fn, arg_list):
        """ Because it's difficult to specify keyword arguments of Module
        functions during tracing, we need to pass them as a tuple. As a
        sanity check, we manually verify their order here.
        """
        fn_arg_names = fn.__code__.co_varnames
        # Remove the "self" parameter
        assert(fn_arg_names[0] == "self")
        fn_arg_names = fn_arg_names[1:]
        # Trim unspecified arguments
        fn_arg_names = fn_arg_names[:len(arg_list)]
        name_tups = list(zip(fn_arg_names, [n for n, _ in arg_list]))
        assert(all([n1 == n2 for n1, n2 in name_tups]))
    # Attention modules are chunked more finely than their parent blocks
    evoformer_attn_chunk_size = max(
        model.globals.chunk_size, evoformer_chunk_size // 4
    )
    # MSA row attention
    msa_att_row_arg_tuples = [
        ("m", m),
        ("z", z),
        ("mask", msa_mask),
        ("chunk_size", torch.tensor(evoformer_attn_chunk_size)),
        ("use_memory_efficient_kernel", torch.tensor(False)),
        ("use_lma", torch.tensor(model.globals.use_lma)),
    ]
    verify_arg_order(
        model.evoformer.blocks[0].msa_att_row.forward,
        msa_att_row_arg_tuples
    )
    msa_att_row_args = [arg for _, arg in msa_att_row_arg_tuples]
    with torch.no_grad():
        for b in model.evoformer.blocks:
            traced_block = trace_block(
                b.msa_att_row, msa_att_row_args
            )
            del b.msa_att_row
            b.msa_att_row = traced_block
    # MSA col attention
    msa_att_col_arg_tuples = [
        ("m", m),
        ("mask", msa_mask),
        ("chunk_size", torch.tensor(evoformer_chunk_size)),
        ("use_lma", torch.tensor(model.globals.use_lma)),
        ("use_flash", torch.tensor(model.globals.use_flash)),
    ]
    verify_arg_order(
        model.evoformer.blocks[0].msa_att_col.forward,
        msa_att_col_arg_tuples
    )
    msa_att_col_args = [arg for _, arg in msa_att_col_arg_tuples]
    with torch.no_grad():
        for b in model.evoformer.blocks:
            traced_block = trace_block(
                b.msa_att_col, msa_att_col_args
            )
            del b.msa_att_col
            b.msa_att_col = traced_block
    # OPM
    opm_arg_tuples = [
        ("m", m),
        ("mask", msa_mask.float()),
        ("chunk_size", torch.tensor(evoformer_chunk_size)),
        ("inplace_safe", torch.tensor(True)),
    ]
    verify_arg_order(
        model.evoformer.blocks[0].core.outer_product_mean.forward,
        opm_arg_tuples
    )
    opm_args = [arg for _, arg in opm_arg_tuples]
    with torch.no_grad():
        for b in model.evoformer.blocks:
            traced_block = trace_block(
                b.core.outer_product_mean, opm_args
            )
            del b.core.outer_product_mean
            b.core.outer_product_mean = traced_block
    # Triangular multiplicative update (out)
    tri_mul_out_arg_tuples = [
        ("z", z),
        ("mask", pair_mask.float()),
        ("inplace_safe", torch.tensor(True)),
        ("_add_with_inplace", torch.tensor(True)),
    ]
    verify_arg_order(
        model.evoformer.blocks[0].core.tri_mul_out.forward,
        tri_mul_out_arg_tuples
    )
    tri_mul_out_args = [arg for _, arg in tri_mul_out_arg_tuples]
    with torch.no_grad():
        for b in model.evoformer.blocks:
            traced_block = trace_block(
                b.core.tri_mul_out, tri_mul_out_args
            )
            del b.core.tri_mul_out
            b.core.tri_mul_out = traced_block
    # Triangular multiplicative update (in)
    tri_mul_in_arg_tuples = [
        ("z", z),
        ("mask", pair_mask.float()),
        ("inplace_safe", torch.tensor(True)),
        ("_add_with_inplace", torch.tensor(True)),
    ]
    verify_arg_order(
        model.evoformer.blocks[0].core.tri_mul_in.forward,
        tri_mul_in_arg_tuples
    )
    tri_mul_in_args = [arg for _, arg in tri_mul_in_arg_tuples]
    with torch.no_grad():
        for b in model.evoformer.blocks:
            traced_block = trace_block(
                b.core.tri_mul_in, tri_mul_in_args
            )
            del b.core.tri_mul_in
            b.core.tri_mul_in = traced_block
    # Triangular attention (start)
    tri_att_start_arg_tuples = [
        ("x", z),
        ("mask", pair_mask.float()),
        ("chunk_size", torch.tensor(evoformer_attn_chunk_size)),
        ("use_memory_efficient_kernel", torch.tensor(False)),
        ("use_lma", torch.tensor(model.globals.use_lma)),
        ("inplace_safe", torch.tensor(True)),
    ]
    verify_arg_order(
        model.evoformer.blocks[0].core.tri_att_start.forward,
        tri_att_start_arg_tuples
    )
    tri_att_start_args = [arg for _, arg in tri_att_start_arg_tuples]
    with torch.no_grad():
        for b in model.evoformer.blocks:
            traced_block = trace_block(
                b.core.tri_att_start, tri_att_start_args
            )
            del b.core.tri_att_start
            b.core.tri_att_start = traced_block
    # Triangular attention (end)
    # NOTE(review): inputs are transposed here because the "end" attention
    # presumably runs over the transposed pair representation — confirm
    tri_att_end_arg_tuples = [
        ("x", z.transpose(-2, -3)),
        ("mask", pair_mask.transpose(-1, -2).float()),
        ("chunk_size", torch.tensor(evoformer_attn_chunk_size)),
        ("use_memory_efficient_kernel", torch.tensor(False)),
        ("use_lma", torch.tensor(model.globals.use_lma)),
        ("inplace_safe", torch.tensor(True)),
    ]
    verify_arg_order(
        model.evoformer.blocks[0].core.tri_att_end.forward,
        tri_att_end_arg_tuples
    )
    tri_att_end_args = [arg for _, arg in tri_att_end_arg_tuples]
    with torch.no_grad():
        for b in model.evoformer.blocks:
            traced_block = trace_block(
                b.core.tri_att_end, tri_att_end_args
            )
            del b.core.tri_att_end
            b.core.tri_att_end = traced_block
    # The commented-out sections below are earlier whole-block tracing
    # attempts kept for reference
    #evoformer_arg_tuples = [
    #    ("m", m),
    #    ("z", z),
    #    ("msa_mask", msa_mask),
    #    ("pair_mask", pair_mask),
    #    ("chunk_size", torch.tensor(evoformer_chunk_size)),
    #    ("use_lma", torch.tensor(model.globals.use_lma)),
    #    ("use_flash", torch.tensor(model.globals.use_flash)),
    #    ("inplace_safe", torch.tensor(1)),
    #    ("_mask_trans", torch.tensor(model.config._mask_trans)),
    #    ("_attn_chunk_size", torch.tensor(evoformer_attn_chunk_size)),
    #]
    #verify_arg_order(model.evoformer.blocks[0].forward, evoformer_arg_tuples)
    #evoformer_args = [arg for _, arg in evoformer_arg_tuples]
    #with torch.no_grad():
    #    traced_evoformer_stack = []
    #    for b in model.evoformer.blocks:
    #        traced_block = trace_block(b, evoformer_args)
    #        traced_evoformer_stack.append(traced_block)
    #del model.evoformer.blocks
    #model.evoformer.blocks = traced_evoformer_stack
    # with torch.no_grad():
    #     for b in model.evoformer.blocks:
    #         _ = b(*evoformer_args)
    #
    # with torch.no_grad():
    #     for b in model.evoformer.blocks:
    #         _ = b(*evoformer_args)
    # extra_msa_attn_chunk_size = max(
    #     model.globals.chunk_size, extra_msa_chunk_size // 4
    # )
    # extra_msa_arg_tuples = [
    #     ("m", a),
    #     ("z", z),
    #     ("msa_mask", extra_msa_mask),
    #     ("pair_mask", pair_mask),
    #     ("chunk_size", torch.tensor(extra_msa_chunk_size)),
    #     ("use_lma", torch.tensor(model.globals.use_lma)),
    #     ("inplace_safe", torch.tensor(1)),
    #     ("_mask_trans", torch.tensor(model.config._mask_trans)),
    #     ("_attn_chunk_size", torch.tensor(extra_msa_attn_chunk_size)),
    # ]
    # verify_arg_order(
    #     model.extra_msa_stack.blocks[0].forward, extra_msa_arg_tuples
    # )
    # extra_msa_args = [arg for _, arg in extra_msa_arg_tuples]
    # with torch.no_grad():
    #     traced_extra_msa_stack = []
    #     for b in model.extra_msa_stack.blocks:
    #         traced_block = trace_block(b, extra_msa_args)
    #         traced_extra_msa_stack.append(traced_block)
    #
    # del model.extra_msa_stack.blocks
    # model.extra_msa_stack.blocks = traced_extra_msa_stack
    # if(model.template_config.enabled):
    #     template_pair_stack_attn_chunk_size = max(
    #         model.globals.chunk_size, template_pair_stack_chunk_size // 4
    #     )
    #     template_pair_stack_arg_tuples = [
    #         ("z", t),
    #         ("mask", template_pair_mask),
    #         ("chunk_size", torch.tensor(template_pair_stack_chunk_size)),
    #         ("use_lma", torch.tensor(model.globals.use_lma)),
    #         ("inplace_safe", torch.tensor(1)),
    #         ("_mask_trans", torch.tensor(model.config._mask_trans)),
    #         ("_attn_chunk_size", torch.tensor(
    #             template_pair_stack_attn_chunk_size
    #         )),
    #     ]
    #     verify_arg_order(
    #         model.template_pair_stack.blocks[0].forward,
    #         template_pair_stack_arg_tuples
    #     )
    #     template_pair_stack_args = [
    #         arg for _, arg in template_pair_stack_arg_tuples
    #     ]
    #
    #     with torch.no_grad():
    #         traced_template_pair_stack = []
    #         for b in model.template_pair_stack.blocks:
    #             traced_block = trace_block(b, template_pair_stack_args)
    #             traced_template_pair_stack.append(traced_block)
    #
    #     del model.template_pair_stack.blocks
    #     model.template_pair_stack.blocks = traced_template_pair_stack
    # We need to do another dry run after tracing to allow the model to reach
    # top speeds. Why, I don't know.
    two_recycling_iter_input = tensor_tree_map(
        lambda t: t[..., :2], sample_input,
    )
    with torch.no_grad():
        _ = model(two_recycling_iter_input)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment