"vscode:/vscode.git/clone" did not exist on "bd84753c6b9a9c4a60c30fcb6e7ac648ed5fca85"
Commit b14e47f4 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'main' of https://github.com/hpcaitech/FastFold

parents 490cb6f5 05681304
Pipeline #234 failed with stages
in 0 seconds
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
from functools import reduce, wraps
from operator import add
import numpy as np
import torch
from fastfold.config import NUM_RES, NUM_EXTRA_SEQ, NUM_TEMPLATES, NUM_MSA_SEQ
from fastfold.common import residue_constants as rc
from fastfold.utils.rigid_utils import Rotation, Rigid
from fastfold.utils.tensor_utils import (
tree_map,
tensor_tree_map,
batched_gather,
)
MSA_FEATURE_NAMES = [
"msa",
"deletion_matrix",
"msa_mask",
"msa_row_mask",
"bert_mask",
"true_msa",
]
def cast_to_64bit_ints(protein):
# We keep all ints as int64
for k, v in protein.items():
if v.dtype == torch.int32:
protein[k] = v.type(torch.int64)
return protein
def make_one_hot(x, num_classes):
x_one_hot = torch.zeros(*x.shape, num_classes, device=x.device)
x_one_hot.scatter_(-1, x.unsqueeze(-1), 1)
return x_one_hot
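# Illustrative usage of make_one_hot (a minimal sketch with toy values; not
# part of the original module): a [2, 3] tensor of residue indices becomes a
# [2, 3, 21] one-hot encoding.
def _example_make_one_hot():
    aatype = torch.tensor([[0, 4, 20], [7, 1, 2]])
    one_hot = make_one_hot(aatype, num_classes=21)
    assert one_hot.shape == (2, 3, 21)
    assert one_hot[0, 1, 4] == 1.0
    return one_hot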
def make_seq_mask(protein):
protein["seq_mask"] = torch.ones(
protein["aatype"].shape, dtype=torch.float32
)
return protein
def make_template_mask(protein):
protein["template_mask"] = torch.ones(
protein["template_aatype"].shape[0], dtype=torch.float32
)
return protein
def curry1(f):
"""Supply all arguments but the first."""
@wraps(f)
def fc(*args, **kwargs):
return lambda x: f(x, *args, **kwargs)
return fc
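# Sketch of how curry1 is used in practice (the transform and its arguments
# below are hypothetical): a decorated transform is configured with everything
# except the feature dict, yielding a one-argument callable that composes into
# the pipeline.
def _example_curry1():
    @curry1
    def scale_feature(protein, key, factor):
        protein[key] = protein[key] * factor
        return protein
    transform = scale_feature("msa", 2)  # all arguments except the dict
    protein = {"msa": torch.ones(4, 8)}
    return transform(protein)  # equivalent to scale_feature(protein, "msa", 2)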
def make_all_atom_aatype(protein):
protein["all_atom_aatype"] = protein["aatype"]
return protein
def fix_templates_aatype(protein):
# Map one-hot to indices
num_templates = protein["template_aatype"].shape[0]
if(num_templates > 0):
protein["template_aatype"] = torch.argmax(
protein["template_aatype"], dim=-1
)
# Map hhsearch-aatype to our aatype.
new_order_list = rc.MAP_HHBLITS_AATYPE_TO_OUR_AATYPE
new_order = torch.tensor(
new_order_list, dtype=torch.int64, device=protein["aatype"].device,
).expand(num_templates, -1)
protein["template_aatype"] = torch.gather(
new_order, 1, index=protein["template_aatype"]
)
return protein
def correct_msa_restypes(protein):
"""Correct MSA restype to have the same order as rc."""
new_order_list = rc.MAP_HHBLITS_AATYPE_TO_OUR_AATYPE
new_order = torch.tensor(
[new_order_list] * protein["msa"].shape[1],
device=protein["msa"].device,
).transpose(0, 1)
protein["msa"] = torch.gather(new_order, 0, protein["msa"])
perm_matrix = np.zeros((22, 22), dtype=np.float32)
perm_matrix[range(len(new_order_list)), new_order_list] = 1.0
for k in protein:
if "profile" in k:
            num_dim = protein[k].shape[-1]
            assert num_dim in [
                20,
                21,
                22,
            ], "num_dim for %s out of expected range: %s" % (k, num_dim)
            protein[k] = torch.matmul(
                protein[k],
                protein[k].new_tensor(perm_matrix[:num_dim, :num_dim]),
            )
return protein
def squeeze_features(protein):
"""Remove singleton and repeated dimensions in protein features."""
protein["aatype"] = torch.argmax(protein["aatype"], dim=-1)
for k in [
"domain_name",
"msa",
"num_alignments",
"seq_length",
"sequence",
"superfamily",
"deletion_matrix",
"resolution",
"between_segment_residues",
"residue_index",
"template_all_atom_mask",
]:
if k in protein:
final_dim = protein[k].shape[-1]
if isinstance(final_dim, int) and final_dim == 1:
if torch.is_tensor(protein[k]):
protein[k] = torch.squeeze(protein[k], dim=-1)
else:
protein[k] = np.squeeze(protein[k], axis=-1)
for k in ["seq_length", "num_alignments"]:
if k in protein:
protein[k] = protein[k][0]
return protein
@curry1
def randomly_replace_msa_with_unknown(protein, replace_proportion):
"""Replace a portion of the MSA with 'X'."""
msa_mask = torch.rand(protein["msa"].shape) < replace_proportion
x_idx = 20
gap_idx = 21
msa_mask = torch.logical_and(msa_mask, protein["msa"] != gap_idx)
protein["msa"] = torch.where(
msa_mask,
torch.ones_like(protein["msa"]) * x_idx,
protein["msa"]
)
aatype_mask = torch.rand(protein["aatype"].shape) < replace_proportion
protein["aatype"] = torch.where(
aatype_mask,
torch.ones_like(protein["aatype"]) * x_idx,
protein["aatype"],
)
return protein
@curry1
def sample_msa(protein, max_seq, keep_extra, seed=None):
"""Sample MSA randomly, remaining sequences are stored are stored as `extra_*`."""
num_seq = protein["msa"].shape[0]
g = torch.Generator(device=protein["msa"].device)
if seed is not None:
g.manual_seed(seed)
shuffled = torch.randperm(num_seq - 1, generator=g) + 1
index_order = torch.cat(
(torch.tensor([0], device=shuffled.device), shuffled),
dim=0
)
num_sel = min(max_seq, num_seq)
sel_seq, not_sel_seq = torch.split(
index_order, [num_sel, num_seq - num_sel]
)
for k in MSA_FEATURE_NAMES:
if k in protein:
if keep_extra:
protein["extra_" + k] = torch.index_select(
protein[k], 0, not_sel_seq
)
protein[k] = torch.index_select(protein[k], 0, sel_seq)
return protein
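# Minimal sketch of sample_msa on toy features (tensor sizes are made up): the
# first row is always kept, at most `max_seq` rows stay under "msa", and the
# remainder moves to "extra_*" features when keep_extra=True.
def _example_sample_msa():
    protein = {
        "msa": torch.randint(0, 22, (10, 16)),
        "msa_mask": torch.ones(10, 16),
    }
    protein = sample_msa(max_seq=4, keep_extra=True, seed=0)(protein)
    assert protein["msa"].shape[0] == 4
    assert protein["extra_msa"].shape[0] == 6
    return protein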
@curry1
def add_distillation_flag(protein, distillation):
protein['is_distillation'] = distillation
return protein
@curry1
def sample_msa_distillation(protein, max_seq):
if(protein["is_distillation"] == 1):
protein = sample_msa(max_seq, keep_extra=False)(protein)
return protein
@curry1
def crop_extra_msa(protein, max_extra_msa):
num_seq = protein["extra_msa"].shape[0]
num_sel = min(max_extra_msa, num_seq)
select_indices = torch.randperm(num_seq)[:num_sel]
for k in MSA_FEATURE_NAMES:
if "extra_" + k in protein:
protein["extra_" + k] = torch.index_select(
protein["extra_" + k], 0, select_indices
)
return protein
def delete_extra_msa(protein):
for k in MSA_FEATURE_NAMES:
if "extra_" + k in protein:
del protein["extra_" + k]
return protein
# Not used in inference
@curry1
def block_delete_msa(protein, config):
    num_seq = protein["msa"].shape[0]
    block_num_seq = int(num_seq * config.msa_fraction_per_block)
    if config.randomize_num_blocks:
        nb = int(
            torch.distributions.uniform.Uniform(
                0, config.num_blocks + 1
            ).sample()
        )
    else:
        nb = config.num_blocks
    del_block_starts = torch.distributions.Uniform(0, num_seq).sample((nb,))
    del_blocks = del_block_starts[:, None].long() + torch.arange(block_num_seq)
    del_blocks = torch.clip(del_blocks, 0, num_seq - 1)
    del_indices = torch.unique(torch.reshape(del_blocks, [-1]))
    # Make sure we keep the original (first) sequence: take the set difference
    # between rows 1..num_seq-1 and the deleted rows, then prepend row 0.
    del_indices = del_indices[del_indices > 0]
    combined = torch.cat((torch.arange(1, num_seq), del_indices))
    uniques, counts = combined.unique(return_counts=True)
    keep_indices = uniques[counts == 1]
    keep_indices = torch.cat((keep_indices.new_tensor([0]), keep_indices))
    for k in MSA_FEATURE_NAMES:
        if k in protein:
            protein[k] = torch.index_select(
                protein[k], 0, keep_indices.to(protein[k].device)
            )
    return protein
@curry1
def nearest_neighbor_clusters(protein, gap_agreement_weight=0.0):
weights = torch.cat(
[
torch.ones(21, device=protein["msa"].device),
gap_agreement_weight * torch.ones(1, device=protein["msa"].device),
torch.zeros(1, device=protein["msa"].device)
],
0,
)
# Make agreement score as weighted Hamming distance
msa_one_hot = make_one_hot(protein["msa"], 23)
sample_one_hot = protein["msa_mask"][:, :, None] * msa_one_hot
extra_msa_one_hot = make_one_hot(protein["extra_msa"], 23)
extra_one_hot = protein["extra_msa_mask"][:, :, None] * extra_msa_one_hot
num_seq, num_res, _ = sample_one_hot.shape
extra_num_seq, _, _ = extra_one_hot.shape
# Compute tf.einsum('mrc,nrc,c->mn', sample_one_hot, extra_one_hot, weights)
# in an optimized fashion to avoid possible memory or computation blowup.
agreement = torch.matmul(
torch.reshape(extra_one_hot, [extra_num_seq, num_res * 23]),
torch.reshape(
sample_one_hot * weights, [num_seq, num_res * 23]
).transpose(0, 1),
)
# Assign each sequence in the extra sequences to the closest MSA sample
protein["extra_cluster_assignment"] = torch.argmax(agreement, dim=1).to(
torch.int64
)
return protein
def unsorted_segment_sum(data, segment_ids, num_segments):
"""
Computes the sum along segments of a tensor. Similar to
tf.unsorted_segment_sum, but only supports 1-D indices.
:param data: A tensor whose segments are to be summed.
:param segment_ids: The 1-D segment indices tensor.
:param num_segments: The number of segments.
:return: A tensor of same data type as the data argument.
"""
assert (
len(segment_ids.shape) == 1 and
segment_ids.shape[0] == data.shape[0]
)
segment_ids = segment_ids.view(
segment_ids.shape[0], *((1,) * len(data.shape[1:]))
)
segment_ids = segment_ids.expand(data.shape)
shape = [num_segments] + list(data.shape[1:])
tensor = (
torch.zeros(*shape, device=segment_ids.device)
.scatter_add_(0, segment_ids, data.float())
)
tensor = tensor.type(data.dtype)
return tensor
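# Sketch of unsorted_segment_sum on toy data (values chosen only for
# illustration): rows of `data` sharing a segment id are summed into a single
# output row, mirroring tf.unsorted_segment_sum for 1-D indices.
def _example_unsorted_segment_sum():
    data = torch.tensor([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])
    segment_ids = torch.tensor([0, 0, 1])
    out = unsorted_segment_sum(data, segment_ids, num_segments=2)
    # out == [[3., 3.], [3., 3.]]
    return out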
@curry1
def summarize_clusters(protein):
"""Produce profile and deletion_matrix_mean within each cluster."""
num_seq = protein["msa"].shape[0]
def csum(x):
return unsorted_segment_sum(
x, protein["extra_cluster_assignment"], num_seq
)
mask = protein["extra_msa_mask"]
mask_counts = 1e-6 + protein["msa_mask"] + csum(mask) # Include center
msa_sum = csum(mask[:, :, None] * make_one_hot(protein["extra_msa"], 23))
msa_sum += make_one_hot(protein["msa"], 23) # Original sequence
protein["cluster_profile"] = msa_sum / mask_counts[:, :, None]
del msa_sum
del_sum = csum(mask * protein["extra_deletion_matrix"])
del_sum += protein["deletion_matrix"] # Original sequence
protein["cluster_deletion_mean"] = del_sum / mask_counts
del del_sum
return protein
def make_msa_mask(protein):
"""Mask features are all ones, but will later be zero-padded."""
protein["msa_mask"] = torch.ones(protein["msa"].shape, dtype=torch.float32)
protein["msa_row_mask"] = torch.ones(
(protein["msa"].shape[0]), dtype=torch.float32
)
return protein
def pseudo_beta_fn(aatype, all_atom_positions, all_atom_mask):
"""Create pseudo beta features."""
is_gly = torch.eq(aatype, rc.restype_order["G"])
ca_idx = rc.atom_order["CA"]
cb_idx = rc.atom_order["CB"]
pseudo_beta = torch.where(
torch.tile(is_gly[..., None], [1] * len(is_gly.shape) + [3]),
all_atom_positions[..., ca_idx, :],
all_atom_positions[..., cb_idx, :],
)
if all_atom_mask is not None:
pseudo_beta_mask = torch.where(
is_gly, all_atom_mask[..., ca_idx], all_atom_mask[..., cb_idx]
)
return pseudo_beta, pseudo_beta_mask
else:
return pseudo_beta
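# Sketch of pseudo_beta_fn on random coordinates (toy shapes, not real
# structures): glycine rows take their CA coordinate as the pseudo-beta
# position, every other residue type takes CB.
def _example_pseudo_beta_fn():
    aatype = torch.tensor([rc.restype_order["G"], rc.restype_order["A"]])
    all_atom_positions = torch.randn(2, 37, 3)
    all_atom_mask = torch.ones(2, 37)
    pseudo_beta, pseudo_beta_mask = pseudo_beta_fn(
        aatype, all_atom_positions, all_atom_mask
    )
    assert pseudo_beta.shape == (2, 3)
    return pseudo_beta, pseudo_beta_mask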
@curry1
def make_pseudo_beta(protein, prefix=""):
"""Create pseudo-beta (alpha for glycine) position and mask."""
assert prefix in ["", "template_"]
(
protein[prefix + "pseudo_beta"],
protein[prefix + "pseudo_beta_mask"],
) = pseudo_beta_fn(
protein["template_aatype" if prefix else "aatype"],
protein[prefix + "all_atom_positions"],
protein["template_all_atom_mask" if prefix else "all_atom_mask"],
)
return protein
@curry1
def add_constant_field(protein, key, value):
protein[key] = torch.tensor(value, device=protein["msa"].device)
return protein
def shaped_categorical(probs, epsilon=1e-10):
ds = probs.shape
num_classes = ds[-1]
distribution = torch.distributions.categorical.Categorical(
torch.reshape(probs + epsilon, [-1, num_classes])
)
counts = distribution.sample()
return torch.reshape(counts, ds[:-1])
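# Sketch: shaped_categorical draws one class index per position from the last
# dimension of a probability tensor, preserving the leading shape. The
# probabilities below are toy values.
def _example_shaped_categorical():
    probs = torch.tensor([[[0.0, 1.0], [1.0, 0.0]]])  # shape [1, 2, 2]
    sample = shaped_categorical(probs)
    assert sample.shape == (1, 2)
    # With these (near-)deterministic probabilities, sample == [[1, 0]].
    return sample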
def make_hhblits_profile(protein):
"""Compute the HHblits MSA profile if not already present."""
if "hhblits_profile" in protein:
return protein
# Compute the profile for every residue (over all MSA sequences).
msa_one_hot = make_one_hot(protein["msa"], 22)
protein["hhblits_profile"] = torch.mean(msa_one_hot, dim=0)
return protein
@curry1
def make_masked_msa(protein, config, replace_fraction):
"""Create data for BERT on raw MSA."""
# Add a random amino acid uniformly.
random_aa = torch.tensor(
[0.05] * 20 + [0.0, 0.0],
dtype=torch.float32,
device=protein["aatype"].device
)
categorical_probs = (
config.uniform_prob * random_aa
+ config.profile_prob * protein["hhblits_profile"]
+ config.same_prob * make_one_hot(protein["msa"], 22)
)
# Put all remaining probability on [MASK] which is a new column
pad_shapes = list(
reduce(add, [(0, 0) for _ in range(len(categorical_probs.shape))])
)
pad_shapes[1] = 1
mask_prob = (
1.0 - config.profile_prob - config.same_prob - config.uniform_prob
)
assert mask_prob >= 0.0
categorical_probs = torch.nn.functional.pad(
categorical_probs, pad_shapes, value=mask_prob
)
sh = protein["msa"].shape
mask_position = torch.rand(sh) < replace_fraction
bert_msa = shaped_categorical(categorical_probs)
bert_msa = torch.where(mask_position, bert_msa, protein["msa"])
# Mix real and masked MSA
protein["bert_mask"] = mask_position.to(torch.float32)
protein["true_msa"] = protein["msa"]
protein["msa"] = bert_msa
return protein
@curry1
def make_fixed_size(
protein,
shape_schema,
msa_cluster_size,
extra_msa_size,
num_res=0,
num_templates=0,
):
"""Guess at the MSA and sequence dimension to make fixed size."""
pad_size_map = {
NUM_RES: num_res,
NUM_MSA_SEQ: msa_cluster_size,
NUM_EXTRA_SEQ: extra_msa_size,
NUM_TEMPLATES: num_templates,
}
for k, v in protein.items():
# Don't transfer this to the accelerator.
if k == "extra_cluster_assignment":
continue
shape = list(v.shape)
schema = shape_schema[k]
msg = "Rank mismatch between shape and shape schema for"
assert len(shape) == len(schema), f"{msg} {k}: {shape} vs {schema}"
pad_size = [
pad_size_map.get(s2, None) or s1 for (s1, s2) in zip(shape, schema)
]
padding = [(0, p - v.shape[i]) for i, p in enumerate(pad_size)]
padding.reverse()
padding = list(itertools.chain(*padding))
if padding:
protein[k] = torch.nn.functional.pad(v, padding)
protein[k] = torch.reshape(protein[k], pad_size)
return protein
@curry1
def make_msa_feat(protein):
"""Create and concatenate MSA features."""
# Whether there is a domain break. Always zero for chains, but keeping for
# compatibility with domain datasets.
has_break = torch.clip(
protein["between_segment_residues"].to(torch.float32), 0, 1
)
aatype_1hot = make_one_hot(protein["aatype"], 21)
target_feat = [
torch.unsqueeze(has_break, dim=-1),
aatype_1hot, # Everyone gets the original sequence.
]
msa_1hot = make_one_hot(protein["msa"], 23)
has_deletion = torch.clip(protein["deletion_matrix"], 0.0, 1.0)
deletion_value = torch.atan(protein["deletion_matrix"] / 3.0) * (
2.0 / np.pi
)
msa_feat = [
msa_1hot,
torch.unsqueeze(has_deletion, dim=-1),
torch.unsqueeze(deletion_value, dim=-1),
]
if "cluster_profile" in protein:
deletion_mean_value = torch.atan(
protein["cluster_deletion_mean"] / 3.0
) * (2.0 / np.pi)
msa_feat.extend(
[
protein["cluster_profile"],
torch.unsqueeze(deletion_mean_value, dim=-1),
]
)
if "extra_deletion_matrix" in protein:
protein["extra_has_deletion"] = torch.clip(
protein["extra_deletion_matrix"], 0.0, 1.0
)
protein["extra_deletion_value"] = torch.atan(
protein["extra_deletion_matrix"] / 3.0
) * (2.0 / np.pi)
protein["msa_feat"] = torch.cat(msa_feat, dim=-1)
protein["target_feat"] = torch.cat(target_feat, dim=-1)
return protein
@curry1
def select_feat(protein, feature_list):
return {k: v for k, v in protein.items() if k in feature_list}
@curry1
def crop_templates(protein, max_templates):
for k, v in protein.items():
if k.startswith("template_"):
protein[k] = v[:max_templates]
return protein
def make_atom14_masks(protein):
"""Construct denser atom positions (14 dimensions instead of 37)."""
restype_atom14_to_atom37 = []
restype_atom37_to_atom14 = []
restype_atom14_mask = []
for rt in rc.restypes:
atom_names = rc.restype_name_to_atom14_names[rc.restype_1to3[rt]]
restype_atom14_to_atom37.append(
[(rc.atom_order[name] if name else 0) for name in atom_names]
)
atom_name_to_idx14 = {name: i for i, name in enumerate(atom_names)}
restype_atom37_to_atom14.append(
[
(atom_name_to_idx14[name] if name in atom_name_to_idx14 else 0)
for name in rc.atom_types
]
)
restype_atom14_mask.append(
[(1.0 if name else 0.0) for name in atom_names]
)
# Add dummy mapping for restype 'UNK'
restype_atom14_to_atom37.append([0] * 14)
restype_atom37_to_atom14.append([0] * 37)
restype_atom14_mask.append([0.0] * 14)
restype_atom14_to_atom37 = torch.tensor(
restype_atom14_to_atom37,
dtype=torch.int32,
device=protein["aatype"].device,
)
restype_atom37_to_atom14 = torch.tensor(
restype_atom37_to_atom14,
dtype=torch.int32,
device=protein["aatype"].device,
)
restype_atom14_mask = torch.tensor(
restype_atom14_mask,
dtype=torch.float32,
device=protein["aatype"].device,
)
protein_aatype = protein['aatype'].to(torch.long)
# create the mapping for (residx, atom14) --> atom37, i.e. an array
# with shape (num_res, 14) containing the atom37 indices for this protein
residx_atom14_to_atom37 = restype_atom14_to_atom37[protein_aatype]
residx_atom14_mask = restype_atom14_mask[protein_aatype]
protein["atom14_atom_exists"] = residx_atom14_mask
protein["residx_atom14_to_atom37"] = residx_atom14_to_atom37.long()
# create the gather indices for mapping back
residx_atom37_to_atom14 = restype_atom37_to_atom14[protein_aatype]
protein["residx_atom37_to_atom14"] = residx_atom37_to_atom14.long()
# create the corresponding mask
restype_atom37_mask = torch.zeros(
[21, 37], dtype=torch.float32, device=protein["aatype"].device
)
for restype, restype_letter in enumerate(rc.restypes):
restype_name = rc.restype_1to3[restype_letter]
atom_names = rc.residue_atoms[restype_name]
for atom_name in atom_names:
atom_type = rc.atom_order[atom_name]
restype_atom37_mask[restype, atom_type] = 1
residx_atom37_mask = restype_atom37_mask[protein_aatype]
protein["atom37_atom_exists"] = residx_atom37_mask
return protein
def make_atom14_masks_np(batch):
batch = tree_map(
lambda n: torch.tensor(n, device=batch["aatype"].device),
batch,
np.ndarray
)
out = make_atom14_masks(batch)
out = tensor_tree_map(lambda t: np.array(t), out)
return out
def make_atom14_positions(protein):
"""Constructs denser atom positions (14 dimensions instead of 37)."""
residx_atom14_mask = protein["atom14_atom_exists"]
residx_atom14_to_atom37 = protein["residx_atom14_to_atom37"]
# Create a mask for known ground truth positions.
residx_atom14_gt_mask = residx_atom14_mask * batched_gather(
protein["all_atom_mask"],
residx_atom14_to_atom37,
dim=-1,
no_batch_dims=len(protein["all_atom_mask"].shape[:-1]),
)
# Gather the ground truth positions.
residx_atom14_gt_positions = residx_atom14_gt_mask[..., None] * (
batched_gather(
protein["all_atom_positions"],
residx_atom14_to_atom37,
dim=-2,
no_batch_dims=len(protein["all_atom_positions"].shape[:-2]),
)
)
protein["atom14_atom_exists"] = residx_atom14_mask
protein["atom14_gt_exists"] = residx_atom14_gt_mask
protein["atom14_gt_positions"] = residx_atom14_gt_positions
# As the atom naming is ambiguous for 7 of the 20 amino acids, provide
# alternative ground truth coordinates where the naming is swapped
restype_3 = [rc.restype_1to3[res] for res in rc.restypes]
restype_3 += ["UNK"]
# Matrices for renaming ambiguous atoms.
all_matrices = {
res: torch.eye(
14,
dtype=protein["all_atom_mask"].dtype,
device=protein["all_atom_mask"].device,
)
for res in restype_3
}
for resname, swap in rc.residue_atom_renaming_swaps.items():
correspondences = torch.arange(
14, device=protein["all_atom_mask"].device
)
for source_atom_swap, target_atom_swap in swap.items():
source_index = rc.restype_name_to_atom14_names[resname].index(
source_atom_swap
)
target_index = rc.restype_name_to_atom14_names[resname].index(
target_atom_swap
)
correspondences[source_index] = target_index
correspondences[target_index] = source_index
renaming_matrix = protein["all_atom_mask"].new_zeros((14, 14))
for index, correspondence in enumerate(correspondences):
renaming_matrix[index, correspondence] = 1.0
all_matrices[resname] = renaming_matrix
renaming_matrices = torch.stack(
[all_matrices[restype] for restype in restype_3]
)
# Pick the transformation matrices for the given residue sequence
# shape (num_res, 14, 14).
renaming_transform = renaming_matrices[protein["aatype"]]
# Apply it to the ground truth positions. shape (num_res, 14, 3).
alternative_gt_positions = torch.einsum(
"...rac,...rab->...rbc", residx_atom14_gt_positions, renaming_transform
)
protein["atom14_alt_gt_positions"] = alternative_gt_positions
# Create the mask for the alternative ground truth (differs from the
# ground truth mask, if only one of the atoms in an ambiguous pair has a
# ground truth position).
alternative_gt_mask = torch.einsum(
"...ra,...rab->...rb", residx_atom14_gt_mask, renaming_transform
)
protein["atom14_alt_gt_exists"] = alternative_gt_mask
# Create an ambiguous atoms mask. shape: (21, 14).
restype_atom14_is_ambiguous = protein["all_atom_mask"].new_zeros((21, 14))
for resname, swap in rc.residue_atom_renaming_swaps.items():
for atom_name1, atom_name2 in swap.items():
restype = rc.restype_order[rc.restype_3to1[resname]]
atom_idx1 = rc.restype_name_to_atom14_names[resname].index(
atom_name1
)
atom_idx2 = rc.restype_name_to_atom14_names[resname].index(
atom_name2
)
restype_atom14_is_ambiguous[restype, atom_idx1] = 1
restype_atom14_is_ambiguous[restype, atom_idx2] = 1
# From this create an ambiguous_mask for the given sequence.
protein["atom14_atom_is_ambiguous"] = restype_atom14_is_ambiguous[
protein["aatype"]
]
return protein
def atom37_to_frames(protein, eps=1e-8):
aatype = protein["aatype"]
all_atom_positions = protein["all_atom_positions"]
all_atom_mask = protein["all_atom_mask"]
batch_dims = len(aatype.shape[:-1])
restype_rigidgroup_base_atom_names = np.full([21, 8, 3], "", dtype=object)
restype_rigidgroup_base_atom_names[:, 0, :] = ["C", "CA", "N"]
restype_rigidgroup_base_atom_names[:, 3, :] = ["CA", "C", "O"]
for restype, restype_letter in enumerate(rc.restypes):
resname = rc.restype_1to3[restype_letter]
for chi_idx in range(4):
if rc.chi_angles_mask[restype][chi_idx]:
names = rc.chi_angles_atoms[resname][chi_idx]
restype_rigidgroup_base_atom_names[
restype, chi_idx + 4, :
] = names[1:]
restype_rigidgroup_mask = all_atom_mask.new_zeros(
(*aatype.shape[:-1], 21, 8),
)
restype_rigidgroup_mask[..., 0] = 1
restype_rigidgroup_mask[..., 3] = 1
restype_rigidgroup_mask[..., :20, 4:] = all_atom_mask.new_tensor(
rc.chi_angles_mask
)
lookuptable = rc.atom_order.copy()
lookuptable[""] = 0
lookup = np.vectorize(lambda x: lookuptable[x])
restype_rigidgroup_base_atom37_idx = lookup(
restype_rigidgroup_base_atom_names,
)
restype_rigidgroup_base_atom37_idx = aatype.new_tensor(
restype_rigidgroup_base_atom37_idx,
)
restype_rigidgroup_base_atom37_idx = (
restype_rigidgroup_base_atom37_idx.view(
*((1,) * batch_dims), *restype_rigidgroup_base_atom37_idx.shape
)
)
residx_rigidgroup_base_atom37_idx = batched_gather(
restype_rigidgroup_base_atom37_idx,
aatype,
dim=-3,
no_batch_dims=batch_dims,
)
base_atom_pos = batched_gather(
all_atom_positions,
residx_rigidgroup_base_atom37_idx,
dim=-2,
no_batch_dims=len(all_atom_positions.shape[:-2]),
)
gt_frames = Rigid.from_3_points(
p_neg_x_axis=base_atom_pos[..., 0, :],
origin=base_atom_pos[..., 1, :],
p_xy_plane=base_atom_pos[..., 2, :],
eps=eps,
)
group_exists = batched_gather(
restype_rigidgroup_mask,
aatype,
dim=-2,
no_batch_dims=batch_dims,
)
gt_atoms_exist = batched_gather(
all_atom_mask,
residx_rigidgroup_base_atom37_idx,
dim=-1,
no_batch_dims=len(all_atom_mask.shape[:-1]),
)
gt_exists = torch.min(gt_atoms_exist, dim=-1)[0] * group_exists
rots = torch.eye(3, dtype=all_atom_mask.dtype, device=aatype.device)
rots = torch.tile(rots, (*((1,) * batch_dims), 8, 1, 1))
rots[..., 0, 0, 0] = -1
rots[..., 0, 2, 2] = -1
rots = Rotation(rot_mats=rots)
gt_frames = gt_frames.compose(Rigid(rots, None))
restype_rigidgroup_is_ambiguous = all_atom_mask.new_zeros(
*((1,) * batch_dims), 21, 8
)
restype_rigidgroup_rots = torch.eye(
3, dtype=all_atom_mask.dtype, device=aatype.device
)
restype_rigidgroup_rots = torch.tile(
restype_rigidgroup_rots,
(*((1,) * batch_dims), 21, 8, 1, 1),
)
for resname, _ in rc.residue_atom_renaming_swaps.items():
restype = rc.restype_order[rc.restype_3to1[resname]]
chi_idx = int(sum(rc.chi_angles_mask[restype]) - 1)
restype_rigidgroup_is_ambiguous[..., restype, chi_idx + 4] = 1
restype_rigidgroup_rots[..., restype, chi_idx + 4, 1, 1] = -1
restype_rigidgroup_rots[..., restype, chi_idx + 4, 2, 2] = -1
residx_rigidgroup_is_ambiguous = batched_gather(
restype_rigidgroup_is_ambiguous,
aatype,
dim=-2,
no_batch_dims=batch_dims,
)
residx_rigidgroup_ambiguity_rot = batched_gather(
restype_rigidgroup_rots,
aatype,
dim=-4,
no_batch_dims=batch_dims,
)
residx_rigidgroup_ambiguity_rot = Rotation(
rot_mats=residx_rigidgroup_ambiguity_rot
)
alt_gt_frames = gt_frames.compose(
Rigid(residx_rigidgroup_ambiguity_rot, None)
)
gt_frames_tensor = gt_frames.to_tensor_4x4()
alt_gt_frames_tensor = alt_gt_frames.to_tensor_4x4()
protein["rigidgroups_gt_frames"] = gt_frames_tensor
protein["rigidgroups_gt_exists"] = gt_exists
protein["rigidgroups_group_exists"] = group_exists
protein["rigidgroups_group_is_ambiguous"] = residx_rigidgroup_is_ambiguous
protein["rigidgroups_alt_gt_frames"] = alt_gt_frames_tensor
return protein
def get_chi_atom_indices():
"""Returns atom indices needed to compute chi angles for all residue types.
Returns:
A tensor of shape [residue_types=21, chis=4, atoms=4]. The residue types are
in the order specified in rc.restypes + unknown residue type
at the end. For chi angles which are not defined on the residue, the
positions indices are by default set to 0.
"""
chi_atom_indices = []
for residue_name in rc.restypes:
residue_name = rc.restype_1to3[residue_name]
residue_chi_angles = rc.chi_angles_atoms[residue_name]
atom_indices = []
for chi_angle in residue_chi_angles:
atom_indices.append([rc.atom_order[atom] for atom in chi_angle])
for _ in range(4 - len(atom_indices)):
atom_indices.append(
[0, 0, 0, 0]
) # For chi angles not defined on the AA.
chi_atom_indices.append(atom_indices)
chi_atom_indices.append([[0, 0, 0, 0]] * 4) # For UNKNOWN residue.
return chi_atom_indices
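# Sketch of the table returned by get_chi_atom_indices: one row per residue
# type (20 standard residues + UNK) and four atom indices for each of the four
# chi angles.
def _example_get_chi_atom_indices():
    chi_atom_indices = torch.as_tensor(get_chi_atom_indices())
    assert chi_atom_indices.shape == (21, 4, 4)
    return chi_atom_indices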
@curry1
def atom37_to_torsion_angles(
protein,
prefix="",
):
"""
Convert coordinates to torsion angles.
This function is extremely sensitive to floating point imprecisions
and should be run with double precision whenever possible.
Args:
Dict containing:
* (prefix)aatype:
[*, N_res] residue indices
* (prefix)all_atom_positions:
[*, N_res, 37, 3] atom positions (in atom37
format)
* (prefix)all_atom_mask:
[*, N_res, 37] atom position mask
Returns:
The same dictionary updated with the following features:
"(prefix)torsion_angles_sin_cos" ([*, N_res, 7, 2])
Torsion angles
"(prefix)alt_torsion_angles_sin_cos" ([*, N_res, 7, 2])
Alternate torsion angles (accounting for 180-degree symmetry)
"(prefix)torsion_angles_mask" ([*, N_res, 7])
Torsion angles mask
"""
aatype = protein[prefix + "aatype"]
all_atom_positions = protein[prefix + "all_atom_positions"]
all_atom_mask = protein[prefix + "all_atom_mask"]
aatype = torch.clamp(aatype, max=20)
pad = all_atom_positions.new_zeros(
[*all_atom_positions.shape[:-3], 1, 37, 3]
)
prev_all_atom_positions = torch.cat(
[pad, all_atom_positions[..., :-1, :, :]], dim=-3
)
pad = all_atom_mask.new_zeros([*all_atom_mask.shape[:-2], 1, 37])
prev_all_atom_mask = torch.cat([pad, all_atom_mask[..., :-1, :]], dim=-2)
pre_omega_atom_pos = torch.cat(
[prev_all_atom_positions[..., 1:3, :], all_atom_positions[..., :2, :]],
dim=-2,
)
phi_atom_pos = torch.cat(
[prev_all_atom_positions[..., 2:3, :], all_atom_positions[..., :3, :]],
dim=-2,
)
psi_atom_pos = torch.cat(
[all_atom_positions[..., :3, :], all_atom_positions[..., 4:5, :]],
dim=-2,
)
pre_omega_mask = torch.prod(
prev_all_atom_mask[..., 1:3], dim=-1
) * torch.prod(all_atom_mask[..., :2], dim=-1)
phi_mask = prev_all_atom_mask[..., 2] * torch.prod(
all_atom_mask[..., :3], dim=-1, dtype=all_atom_mask.dtype
)
psi_mask = (
torch.prod(all_atom_mask[..., :3], dim=-1, dtype=all_atom_mask.dtype)
* all_atom_mask[..., 4]
)
chi_atom_indices = torch.as_tensor(
get_chi_atom_indices(), device=aatype.device
)
atom_indices = chi_atom_indices[..., aatype, :, :]
chis_atom_pos = batched_gather(
all_atom_positions, atom_indices, -2, len(atom_indices.shape[:-2])
)
chi_angles_mask = list(rc.chi_angles_mask)
chi_angles_mask.append([0.0, 0.0, 0.0, 0.0])
chi_angles_mask = all_atom_mask.new_tensor(chi_angles_mask)
chis_mask = chi_angles_mask[aatype, :]
chi_angle_atoms_mask = batched_gather(
all_atom_mask,
atom_indices,
dim=-1,
no_batch_dims=len(atom_indices.shape[:-2]),
)
chi_angle_atoms_mask = torch.prod(
chi_angle_atoms_mask, dim=-1, dtype=chi_angle_atoms_mask.dtype
)
chis_mask = chis_mask * chi_angle_atoms_mask
torsions_atom_pos = torch.cat(
[
pre_omega_atom_pos[..., None, :, :],
phi_atom_pos[..., None, :, :],
psi_atom_pos[..., None, :, :],
chis_atom_pos,
],
dim=-3,
)
torsion_angles_mask = torch.cat(
[
pre_omega_mask[..., None],
phi_mask[..., None],
psi_mask[..., None],
chis_mask,
],
dim=-1,
)
torsion_frames = Rigid.from_3_points(
torsions_atom_pos[..., 1, :],
torsions_atom_pos[..., 2, :],
torsions_atom_pos[..., 0, :],
eps=1e-8,
)
fourth_atom_rel_pos = torsion_frames.invert().apply(
torsions_atom_pos[..., 3, :]
)
torsion_angles_sin_cos = torch.stack(
[fourth_atom_rel_pos[..., 2], fourth_atom_rel_pos[..., 1]], dim=-1
)
denom = torch.sqrt(
torch.sum(
torch.square(torsion_angles_sin_cos),
dim=-1,
dtype=torsion_angles_sin_cos.dtype,
keepdims=True,
)
+ 1e-8
)
torsion_angles_sin_cos = torsion_angles_sin_cos / denom
torsion_angles_sin_cos = torsion_angles_sin_cos * all_atom_mask.new_tensor(
[1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0],
)[((None,) * len(torsion_angles_sin_cos.shape[:-2])) + (slice(None), None)]
chi_is_ambiguous = torsion_angles_sin_cos.new_tensor(
rc.chi_pi_periodic,
)[aatype, ...]
mirror_torsion_angles = torch.cat(
[
all_atom_mask.new_ones(*aatype.shape, 3),
1.0 - 2.0 * chi_is_ambiguous,
],
dim=-1,
)
alt_torsion_angles_sin_cos = (
torsion_angles_sin_cos * mirror_torsion_angles[..., None]
)
protein[prefix + "torsion_angles_sin_cos"] = torsion_angles_sin_cos
protein[prefix + "alt_torsion_angles_sin_cos"] = alt_torsion_angles_sin_cos
protein[prefix + "torsion_angles_mask"] = torsion_angles_mask
return protein
def get_backbone_frames(protein):
# DISCREPANCY: AlphaFold uses tensor_7s here. I don't know why.
protein["backbone_rigid_tensor"] = protein["rigidgroups_gt_frames"][
..., 0, :, :
]
protein["backbone_rigid_mask"] = protein["rigidgroups_gt_exists"][..., 0]
return protein
def get_chi_angles(protein):
dtype = protein["all_atom_mask"].dtype
protein["chi_angles_sin_cos"] = (
protein["torsion_angles_sin_cos"][..., 3:, :]
).to(dtype)
protein["chi_mask"] = protein["torsion_angles_mask"][..., 3:].to(dtype)
return protein
@curry1
def random_crop_to_size(
protein,
crop_size,
max_templates,
shape_schema,
subsample_templates=False,
seed=None,
):
"""Crop randomly to `crop_size`, or keep as is if shorter than that."""
# We want each ensemble to be cropped the same way
g = torch.Generator(device=protein["seq_length"].device)
if seed is not None:
g.manual_seed(seed)
seq_length = protein["seq_length"]
if "template_mask" in protein:
num_templates = protein["template_mask"].shape[-1]
else:
num_templates = 0
# No need to subsample templates if there aren't any
subsample_templates = subsample_templates and num_templates
num_res_crop_size = min(int(seq_length), crop_size)
def _randint(lower, upper):
return int(torch.randint(
lower,
upper + 1,
(1,),
device=protein["seq_length"].device,
generator=g,
)[0])
if subsample_templates:
templates_crop_start = _randint(0, num_templates)
templates_select_indices = torch.randperm(
num_templates, device=protein["seq_length"].device, generator=g
)
else:
templates_crop_start = 0
num_templates_crop_size = min(
num_templates - templates_crop_start, max_templates
)
n = seq_length - num_res_crop_size
if "use_clamped_fape" in protein and protein["use_clamped_fape"] == 1.:
right_anchor = n
else:
x = _randint(0, n)
right_anchor = n - x
num_res_crop_start = _randint(0, right_anchor)
for k, v in protein.items():
if k not in shape_schema or (
"template" not in k and NUM_RES not in shape_schema[k]
):
continue
# randomly permute the templates before cropping them.
if k.startswith("template") and subsample_templates:
v = v[templates_select_indices]
slices = []
for i, (dim_size, dim) in enumerate(zip(shape_schema[k], v.shape)):
is_num_res = dim_size == NUM_RES
if i == 0 and k.startswith("template"):
crop_size = num_templates_crop_size
crop_start = templates_crop_start
else:
crop_start = num_res_crop_start if is_num_res else 0
crop_size = num_res_crop_size if is_num_res else dim
slices.append(slice(crop_start, crop_start + crop_size))
protein[k] = v[slices]
protein["seq_length"] = protein["seq_length"].new_tensor(num_res_crop_size)
return protein
from typing import Sequence
import torch
from fastfold.data.data_transforms import curry1
from fastfold.utils.tensor_utils import masked_mean
def gumbel_noise(
shape: Sequence[int],
device: torch.device,
eps=1e-6,
generator=None,
) -> torch.Tensor:
"""Generate Gumbel Noise of given Shape.
This generates samples from Gumbel(0, 1).
Args:
shape: Shape of noise to return.
Returns:
Gumbel noise of given shape.
"""
uniform_noise = torch.rand(
shape, dtype=torch.float32, device=device, generator=generator
)
gumbel = -torch.log(-torch.log(uniform_noise + eps) + eps)
return gumbel
def gumbel_max_sample(logits: torch.Tensor, generator=None) -> torch.Tensor:
"""Samples from a probability distribution given by 'logits'.
    This uses the Gumbel-max trick to implement the sampling in an efficient manner.
Args:
logits: Logarithm of probabilities to sample from, probabilities can be
unnormalized.
Returns:
Sample from logprobs in one-hot form.
"""
z = gumbel_noise(logits.shape, device=logits.device, generator=generator)
return torch.nn.functional.one_hot(
torch.argmax(logits + z, dim=-1),
logits.shape[-1],
)
def gumbel_argsort_sample_idx(
logits: torch.Tensor,
generator=None
) -> torch.Tensor:
"""Samples with replacement from a distribution given by 'logits'.
This uses Gumbel trick to implement the sampling an efficient manner. For a
distribution over k items this samples k times without replacement, so this
is effectively sampling a random permutation with probabilities over the
permutations derived from the logprobs.
Args:
logits: Logarithm of probabilities to sample from, probabilities can be
unnormalized.
Returns:
Sample from logprobs in one-hot form.
"""
z = gumbel_noise(logits.shape, device=logits.device, generator=generator)
return torch.argsort(logits + z, dim=-1, descending=True)
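# Sketch of the Gumbel-argsort trick on toy logits: per-sequence logits become
# a random permutation, with strongly negative logits (e.g. fully masked rows)
# pushed towards the end of the ordering.
def _example_gumbel_argsort_sample_idx():
    logits = torch.tensor([1e6, 0.0, 0.0, -1e6])
    order = gumbel_argsort_sample_idx(
        logits, generator=torch.Generator().manual_seed(0)
    )
    # Index 0 (huge logit) comes first, index 3 (tiny logit) comes last.
    assert order[0] == 0 and order[-1] == 3
    return order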
@curry1
def make_masked_msa(batch, config, replace_fraction, seed, eps=1e-6):
"""Create data for BERT on raw MSA."""
# Add a random amino acid uniformly.
    random_aa = torch.tensor(
        [0.05] * 20 + [0., 0.],
        dtype=torch.float32,
        device=batch['msa'].device,
    )
categorical_probs = (
config.uniform_prob * random_aa +
config.profile_prob * batch['msa_profile'] +
config.same_prob * torch.nn.functional.one_hot(batch['msa'], 22)
)
# Put all remaining probability on [MASK] which is a new column.
mask_prob = 1. - config.profile_prob - config.same_prob - config.uniform_prob
categorical_probs = torch.nn.functional.pad(
categorical_probs, [0,1], value=mask_prob
)
sh = batch['msa'].shape
mask_position = torch.rand(sh, device=batch['msa'].device) < replace_fraction
mask_position *= batch['msa_mask'].to(mask_position.dtype)
logits = torch.log(categorical_probs + eps)
g = torch.Generator(device=batch["msa"].device)
if seed is not None:
g.manual_seed(seed)
bert_msa = gumbel_max_sample(logits, generator=g)
bert_msa = torch.where(
mask_position,
torch.argmax(bert_msa, dim=-1),
batch['msa']
)
bert_msa *= batch['msa_mask'].to(bert_msa.dtype)
# Mix real and masked MSA.
if 'bert_mask' in batch:
batch['bert_mask'] *= mask_position.to(torch.float32)
else:
batch['bert_mask'] = mask_position.to(torch.float32)
batch['true_msa'] = batch['msa']
batch['msa'] = bert_msa
return batch
@curry1
def nearest_neighbor_clusters(batch, gap_agreement_weight=0.):
"""Assign each extra MSA sequence to its nearest neighbor in sampled MSA."""
device = batch["msa_mask"].device
# Determine how much weight we assign to each agreement. In theory, we could
# use a full blosum matrix here, but right now let's just down-weight gap
# agreement because it could be spurious.
# Never put weight on agreeing on BERT mask.
    weights = torch.tensor(
        [1.] * 21 + [gap_agreement_weight] + [0.],
        dtype=torch.float32,
        device=device,
    )
msa_mask = batch['msa_mask']
msa_one_hot = torch.nn.functional.one_hot(batch['msa'], 23)
extra_mask = batch['extra_msa_mask']
extra_one_hot = torch.nn.functional.one_hot(batch['extra_msa'], 23)
msa_one_hot_masked = msa_mask[:, :, None] * msa_one_hot
extra_one_hot_masked = extra_mask[:, :, None] * extra_one_hot
agreement = torch.einsum(
'mrc, nrc->nm',
extra_one_hot_masked,
weights * msa_one_hot_masked
)
cluster_assignment = torch.nn.functional.softmax(1e3 * agreement, dim=0)
cluster_assignment *= torch.einsum('mr, nr->mn', msa_mask, extra_mask)
cluster_count = torch.sum(cluster_assignment, dim=-1)
cluster_count += 1. # We always include the sequence itself.
msa_sum = torch.einsum('nm, mrc->nrc', cluster_assignment, extra_one_hot_masked)
msa_sum += msa_one_hot_masked
cluster_profile = msa_sum / cluster_count[:, None, None]
extra_deletion_matrix = batch['extra_deletion_matrix']
deletion_matrix = batch['deletion_matrix']
del_sum = torch.einsum(
'nm, mc->nc',
cluster_assignment,
extra_mask * extra_deletion_matrix
)
del_sum += deletion_matrix # Original sequence.
cluster_deletion_mean = del_sum / cluster_count[:, None]
batch['cluster_profile'] = cluster_profile
batch['cluster_deletion_mean'] = cluster_deletion_mean
return batch
def create_target_feat(batch):
"""Create the target features"""
batch["target_feat"] = torch.nn.functional.one_hot(
batch["aatype"], 21
).to(torch.float32)
return batch
def create_msa_feat(batch):
"""Create and concatenate MSA features."""
device = batch["msa"]
msa_1hot = torch.nn.functional.one_hot(batch['msa'], 23)
deletion_matrix = batch['deletion_matrix']
has_deletion = torch.clamp(deletion_matrix, min=0., max=1.)[..., None]
pi = torch.acos(torch.zeros(1, device=deletion_matrix.device)) * 2
deletion_value = (torch.atan(deletion_matrix / 3.) * (2. / pi))[..., None]
deletion_mean_value = (
torch.atan(
batch['cluster_deletion_mean'] / 3.) *
(2. / pi)
)[..., None]
msa_feat = torch.cat(
[
msa_1hot,
has_deletion,
deletion_value,
batch['cluster_profile'],
deletion_mean_value
],
dim=-1,
)
batch["msa_feat"] = msa_feat
return batch
def build_extra_msa_feat(batch):
"""Expand extra_msa into 1hot and concat with other extra msa features.
We do this as late as possible as the one_hot extra msa can be very large.
Args:
batch: a dictionary with the following keys:
* 'extra_msa': [num_seq, num_res] MSA that wasn't selected as a cluster
centre. Note - This isn't one-hotted.
* 'extra_deletion_matrix': [num_seq, num_res] Number of deletions at given
position.
Returns:
Concatenated tensor of extra MSA features.
"""
# 23 = 20 amino acids + 'X' for unknown + gap + bert mask
extra_msa = batch['extra_msa']
deletion_matrix = batch['extra_deletion_matrix']
msa_1hot = torch.nn.functional.one_hot(extra_msa, 23)
has_deletion = torch.clamp(deletion_matrix, min=0., max=1.)[..., None]
pi = torch.acos(torch.zeros(1, device=deletion_matrix.device)) * 2
deletion_value = (
(torch.atan(deletion_matrix / 3.) * (2. / pi))[..., None]
)
extra_msa_mask = batch['extra_msa_mask']
catted = torch.cat([msa_1hot, has_deletion, deletion_value], dim=-1)
return catted
@curry1
def sample_msa(batch, max_seq, max_extra_msa_seq, seed, inf=1e6):
"""Sample MSA randomly, remaining sequences are stored as `extra_*`.
Args:
batch: batch to sample msa from.
max_seq: number of sequences to sample.
Returns:
Protein with sampled msa.
"""
g = torch.Generator(device=batch["msa"].device)
if seed is not None:
g.manual_seed(seed)
# Sample uniformly among sequences with at least one non-masked position.
logits = (torch.clamp(torch.sum(batch['msa_mask'], dim=-1), 0., 1.) - 1.) * inf
# The cluster_bias_mask can be used to preserve the first row (target
# sequence) for each chain, for example.
if 'cluster_bias_mask' not in batch:
cluster_bias_mask = torch.nn.functional.pad(
batch['msa'].new_zeros(batch['msa'].shape[0] - 1),
(1, 0),
value=1.
)
else:
cluster_bias_mask = batch['cluster_bias_mask']
logits += cluster_bias_mask * inf
index_order = gumbel_argsort_sample_idx(logits, generator=g)
sel_idx = index_order[:max_seq]
extra_idx = index_order[max_seq:][:max_extra_msa_seq]
for k in ['msa', 'deletion_matrix', 'msa_mask', 'bert_mask']:
if k in batch:
batch['extra_' + k] = batch[k][extra_idx]
batch[k] = batch[k][sel_idx]
return batch
def make_msa_profile(batch):
"""Compute the MSA profile."""
# Compute the profile for every residue (over all MSA sequences).
batch["msa_profile"] = masked_mean(
batch['msa_mask'][..., None],
torch.nn.functional.one_hot(batch['msa'], 22),
dim=-3,
)
return batch
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""General-purpose errors used throughout the data pipeline"""
class Error(Exception):
"""Base class for exceptions."""
class MultipleChainsError(Error):
"""An error indicating that multiple chains were found for a given ID."""
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from typing import Mapping, Tuple, List, Optional, Dict, Sequence
import ml_collections
import numpy as np
import torch
import fastfold.habana as habana
from fastfold.data import input_pipeline, input_pipeline_multimer
FeatureDict = Mapping[str, np.ndarray]
TensorDict = Dict[str, torch.Tensor]
def np_to_tensor_dict(
np_example: Mapping[str, np.ndarray],
features: Sequence[str],
) -> TensorDict:
"""Creates dict of tensors from a dict of NumPy arrays.
Args:
np_example: A dict of NumPy feature arrays.
features: A list of strings of feature names to be returned in the dataset.
Returns:
A dictionary of features mapping feature names to features. Only the given
features are returned, all other ones are filtered out.
"""
tensor_dict = {
k: torch.tensor(v) for k, v in np_example.items() if k in features
}
return tensor_dict
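# Sketch of np_to_tensor_dict on toy inputs: only the requested feature names
# survive the NumPy -> torch conversion; everything else is dropped.
def _example_np_to_tensor_dict():
    np_example = {
        "aatype": np.zeros((7,), dtype=np.int64),
        "unused_feature": np.zeros((3,)),
    }
    tensors = np_to_tensor_dict(np_example, features=["aatype"])
    assert set(tensors.keys()) == {"aatype"}
    assert isinstance(tensors["aatype"], torch.Tensor)
    return tensors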
def make_data_config(
config: ml_collections.ConfigDict,
mode: str,
num_res: int,
) -> Tuple[ml_collections.ConfigDict, List[str]]:
cfg = copy.deepcopy(config)
mode_cfg = cfg[mode]
with cfg.unlocked():
if mode_cfg.crop_size is None:
mode_cfg.crop_size = num_res
feature_names = cfg.common.unsupervised_features
if cfg.common.use_templates:
feature_names += cfg.common.template_features
if cfg[mode].supervised:
feature_names += cfg.supervised.supervised_features
return cfg, feature_names
def np_example_to_features(
np_example: FeatureDict,
config: ml_collections.ConfigDict,
is_multimer: bool,
mode: str,
):
np_example = dict(np_example)
    num_res = int(np_example["seq_length"][0])
cfg, feature_names = make_data_config(config, mode=mode, num_res=num_res)
if "deletion_matrix_int" in np_example:
np_example["deletion_matrix"] = np_example.pop(
"deletion_matrix_int"
).astype(np.float32)
tensor_dict = np_to_tensor_dict(
np_example=np_example, features=feature_names
)
if is_multimer:
input_pipeline_fn = input_pipeline_multimer.process_tensors_from_config
else:
input_pipeline_fn = input_pipeline.process_tensors_from_config
if habana.is_habana():
from habana_frameworks.torch.hpex import hmp
with torch.no_grad(), hmp.disable_casts():
features = input_pipeline_fn(tensor_dict, cfg.common, cfg[mode])
else:
with torch.no_grad():
features = input_pipeline_fn(tensor_dict, cfg.common, cfg[mode])
return {k: v for k, v in features.items()}
class FeaturePipeline:
def __init__(
self,
config: ml_collections.ConfigDict,
):
self.config = config
def process_features(
self,
raw_features: FeatureDict,
mode: str = "train",
is_multimer: bool = False,
) -> FeatureDict:
return np_example_to_features(
np_example=raw_features,
config=self.config,
mode=mode,
is_multimer=is_multimer,
)
# Copyright 2021 DeepMind Technologies Limited
# Copyright 2022 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Feature processing logic for multimer data pipeline."""
from typing import Iterable, MutableMapping, List, Mapping
from fastfold.data import msa_pairing
from fastfold.common import residue_constants
import numpy as np
# TODO: Move this into the config
REQUIRED_FEATURES = frozenset({
'aatype', 'all_atom_mask', 'all_atom_positions', 'all_chains_entity_ids',
'all_crops_all_chains_mask', 'all_crops_all_chains_positions',
'all_crops_all_chains_residue_ids', 'assembly_num_chains', 'asym_id',
'bert_mask', 'cluster_bias_mask', 'deletion_matrix', 'deletion_mean',
'entity_id', 'entity_mask', 'mem_peak', 'msa', 'msa_mask', 'num_alignments',
'num_templates', 'queue_size', 'residue_index', 'resolution',
'seq_length', 'seq_mask', 'sym_id', 'template_aatype',
'template_all_atom_mask', 'template_all_atom_positions'
})
MAX_TEMPLATES = 4
MSA_CROP_SIZE = 2048
def _is_homomer_or_monomer(chains: Iterable[Mapping[str, np.ndarray]]) -> bool:
"""Checks if a list of chains represents a homomer/monomer example."""
# Note that an entity_id of 0 indicates padding.
num_unique_chains = len(np.unique(np.concatenate(
[np.unique(chain['entity_id'][chain['entity_id'] > 0]) for
chain in chains])))
return num_unique_chains == 1
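# Sketch with toy entity ids: two chains sharing a single entity id form a
# homomer, so no MSA pairing is needed; distinct entity ids would make this
# return False. An entity_id of 0 is padding and is ignored.
def _example_is_homomer_or_monomer():
    chains = [
        {'entity_id': np.array([1, 1, 1])},
        {'entity_id': np.array([1, 1, 0])},
    ]
    assert _is_homomer_or_monomer(chains)
    return _is_homomer_or_monomer(chains)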
def pair_and_merge(
all_chain_features: MutableMapping[str, Mapping[str, np.ndarray]],
) -> Mapping[str, np.ndarray]:
"""Runs processing on features to augment, pair and merge.
Args:
all_chain_features: A MutableMap of dictionaries of features for each chain.
Returns:
A dictionary of features.
"""
process_unmerged_features(all_chain_features)
np_chains_list = list(all_chain_features.values())
pair_msa_sequences = not _is_homomer_or_monomer(np_chains_list)
if pair_msa_sequences:
np_chains_list = msa_pairing.create_paired_features(
chains=np_chains_list
)
np_chains_list = msa_pairing.deduplicate_unpaired_sequences(np_chains_list)
np_chains_list = crop_chains(
np_chains_list,
msa_crop_size=MSA_CROP_SIZE,
pair_msa_sequences=pair_msa_sequences,
max_templates=MAX_TEMPLATES
)
np_example = msa_pairing.merge_chain_features(
np_chains_list=np_chains_list, pair_msa_sequences=pair_msa_sequences,
max_templates=MAX_TEMPLATES
)
np_example = process_final(np_example)
return np_example
def crop_chains(
chains_list: List[Mapping[str, np.ndarray]],
msa_crop_size: int,
pair_msa_sequences: bool,
max_templates: int
) -> List[Mapping[str, np.ndarray]]:
"""Crops the MSAs for a set of chains.
Args:
chains_list: A list of chains to be cropped.
msa_crop_size: The total number of sequences to crop from the MSA.
pair_msa_sequences: Whether we are operating in sequence-pairing mode.
max_templates: The maximum templates to use per chain.
Returns:
The chains cropped.
"""
# Apply the cropping.
cropped_chains = []
for chain in chains_list:
cropped_chain = _crop_single_chain(
chain,
msa_crop_size=msa_crop_size,
pair_msa_sequences=pair_msa_sequences,
max_templates=max_templates)
cropped_chains.append(cropped_chain)
return cropped_chains
def _crop_single_chain(chain: Mapping[str, np.ndarray],
msa_crop_size: int,
pair_msa_sequences: bool,
max_templates: int) -> Mapping[str, np.ndarray]:
"""Crops msa sequences to `msa_crop_size`."""
msa_size = chain['num_alignments']
if pair_msa_sequences:
msa_size_all_seq = chain['num_alignments_all_seq']
msa_crop_size_all_seq = np.minimum(msa_size_all_seq, msa_crop_size // 2)
# We reduce the number of un-paired sequences, by the number of times a
# sequence from this chain's MSA is included in the paired MSA. This keeps
# the MSA size for each chain roughly constant.
msa_all_seq = chain['msa_all_seq'][:msa_crop_size_all_seq, :]
num_non_gapped_pairs = np.sum(
np.any(msa_all_seq != msa_pairing.MSA_GAP_IDX, axis=1))
num_non_gapped_pairs = np.minimum(num_non_gapped_pairs,
msa_crop_size_all_seq)
# Restrict the unpaired crop size so that paired+unpaired sequences do not
# exceed msa_seqs_per_chain for each chain.
max_msa_crop_size = np.maximum(msa_crop_size - num_non_gapped_pairs, 0)
msa_crop_size = np.minimum(msa_size, max_msa_crop_size)
else:
msa_crop_size = np.minimum(msa_size, msa_crop_size)
include_templates = 'template_aatype' in chain and max_templates
if include_templates:
num_templates = chain['template_aatype'].shape[0]
templates_crop_size = np.minimum(num_templates, max_templates)
for k in chain:
k_split = k.split('_all_seq')[0]
if k_split in msa_pairing.TEMPLATE_FEATURES:
chain[k] = chain[k][:templates_crop_size, :]
elif k_split in msa_pairing.MSA_FEATURES:
if '_all_seq' in k and pair_msa_sequences:
chain[k] = chain[k][:msa_crop_size_all_seq, :]
else:
chain[k] = chain[k][:msa_crop_size, :]
chain['num_alignments'] = np.asarray(msa_crop_size, dtype=np.int32)
if include_templates:
chain['num_templates'] = np.asarray(templates_crop_size, dtype=np.int32)
if pair_msa_sequences:
chain['num_alignments_all_seq'] = np.asarray(
msa_crop_size_all_seq, dtype=np.int32)
return chain
def process_final(
np_example: Mapping[str, np.ndarray]
) -> Mapping[str, np.ndarray]:
"""Final processing steps in data pipeline, after merging and pairing."""
np_example = _correct_msa_restypes(np_example)
np_example = _make_seq_mask(np_example)
np_example = _make_msa_mask(np_example)
np_example = _filter_features(np_example)
return np_example
def _correct_msa_restypes(np_example):
"""Correct MSA restype to have the same order as residue_constants."""
new_order_list = residue_constants.MAP_HHBLITS_AATYPE_TO_OUR_AATYPE
np_example['msa'] = np.take(new_order_list, np_example['msa'], axis=0)
np_example['msa'] = np_example['msa'].astype(np.int32)
return np_example
def _make_seq_mask(np_example):
np_example['seq_mask'] = (np_example['entity_id'] > 0).astype(np.float32)
return np_example
def _make_msa_mask(np_example):
"""Mask features are all ones, but will later be zero-padded."""
np_example['msa_mask'] = np.ones_like(np_example['msa'], dtype=np.float32)
seq_mask = (np_example['entity_id'] > 0).astype(np.float32)
np_example['msa_mask'] *= seq_mask[None]
return np_example
def _filter_features(
np_example: Mapping[str, np.ndarray]
) -> Mapping[str, np.ndarray]:
"""Filters features of example to only those requested."""
return {k: v for (k, v) in np_example.items() if k in REQUIRED_FEATURES}
def process_unmerged_features(
all_chain_features: MutableMapping[str, Mapping[str, np.ndarray]]
):
"""Postprocessing stage for per-chain features before merging."""
num_chains = len(all_chain_features)
for chain_features in all_chain_features.values():
# Convert deletion matrices to float.
chain_features['deletion_matrix'] = np.asarray(
chain_features.pop('deletion_matrix_int'), dtype=np.float32
)
if 'deletion_matrix_int_all_seq' in chain_features:
chain_features['deletion_matrix_all_seq'] = np.asarray(
chain_features.pop('deletion_matrix_int_all_seq'), dtype=np.float32
)
chain_features['deletion_mean'] = np.mean(
chain_features['deletion_matrix'], axis=0
)
# Add all_atom_mask and dummy all_atom_positions based on aatype.
all_atom_mask = residue_constants.STANDARD_ATOM_MASK[
chain_features['aatype']]
chain_features['all_atom_mask'] = all_atom_mask
chain_features['all_atom_positions'] = np.zeros(
list(all_atom_mask.shape) + [3])
# Add assembly_num_chains.
chain_features['assembly_num_chains'] = np.asarray(num_chains)
# Add entity_mask.
for chain_features in all_chain_features.values():
chain_features['entity_mask'] = (
chain_features['entity_id'] != 0).astype(np.int32)
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
import torch
from fastfold.data import data_transforms
def nonensembled_transform_fns(common_cfg, mode_cfg):
"""Input pipeline data transformers that are not ensembled."""
transforms = [
data_transforms.cast_to_64bit_ints,
data_transforms.correct_msa_restypes,
data_transforms.squeeze_features,
data_transforms.randomly_replace_msa_with_unknown(0.0),
data_transforms.make_seq_mask,
data_transforms.make_msa_mask,
data_transforms.make_hhblits_profile,
]
if common_cfg.use_templates:
transforms.extend(
[
data_transforms.fix_templates_aatype,
data_transforms.make_template_mask,
data_transforms.make_pseudo_beta("template_"),
]
)
if common_cfg.use_template_torsion_angles:
transforms.extend(
[
data_transforms.atom37_to_torsion_angles("template_"),
]
)
transforms.extend(
[
data_transforms.make_atom14_masks,
]
)
if mode_cfg.supervised:
transforms.extend(
[
data_transforms.make_atom14_positions,
data_transforms.atom37_to_frames,
data_transforms.atom37_to_torsion_angles(""),
data_transforms.make_pseudo_beta(""),
data_transforms.get_backbone_frames,
data_transforms.get_chi_angles,
]
)
return transforms
def ensembled_transform_fns(common_cfg, mode_cfg, ensemble_seed):
"""Input pipeline data transformers that can be ensembled and averaged."""
transforms = []
if "max_distillation_msa_clusters" in mode_cfg:
transforms.append(
data_transforms.sample_msa_distillation(
mode_cfg.max_distillation_msa_clusters
)
)
if common_cfg.reduce_msa_clusters_by_max_templates:
pad_msa_clusters = mode_cfg.max_msa_clusters - mode_cfg.max_templates
else:
pad_msa_clusters = mode_cfg.max_msa_clusters
max_msa_clusters = pad_msa_clusters
max_extra_msa = common_cfg.max_extra_msa
msa_seed = None
if(not common_cfg.resample_msa_in_recycling):
msa_seed = ensemble_seed
transforms.append(
data_transforms.sample_msa(
max_msa_clusters,
keep_extra=True,
seed=msa_seed,
)
)
if "masked_msa" in common_cfg:
# Masked MSA should come *before* MSA clustering so that
# the clustering and full MSA profile do not leak information about
# the masked locations and secret corrupted locations.
transforms.append(
data_transforms.make_masked_msa(
common_cfg.masked_msa, mode_cfg.masked_msa_replace_fraction
)
)
if common_cfg.msa_cluster_features:
transforms.append(data_transforms.nearest_neighbor_clusters())
transforms.append(data_transforms.summarize_clusters())
# Crop after creating the cluster profiles.
if max_extra_msa:
transforms.append(data_transforms.crop_extra_msa(max_extra_msa))
else:
transforms.append(data_transforms.delete_extra_msa)
transforms.append(data_transforms.make_msa_feat())
crop_feats = dict(common_cfg.feat)
if mode_cfg.fixed_size:
transforms.append(data_transforms.select_feat(list(crop_feats)))
transforms.append(
data_transforms.random_crop_to_size(
mode_cfg.crop_size,
mode_cfg.max_templates,
crop_feats,
mode_cfg.subsample_templates,
seed=ensemble_seed + 1,
)
)
transforms.append(
data_transforms.make_fixed_size(
crop_feats,
pad_msa_clusters,
common_cfg.max_extra_msa,
mode_cfg.crop_size,
mode_cfg.max_templates,
)
)
else:
transforms.append(
data_transforms.crop_templates(mode_cfg.max_templates)
)
return transforms
def process_tensors_from_config(tensors, common_cfg, mode_cfg):
"""Based on the config, apply filters and transformations to the data."""
ensemble_seed = torch.Generator().seed()
def wrap_ensemble_fn(data, i):
"""Function to be mapped over the ensemble dimension."""
d = data.copy()
fns = ensembled_transform_fns(
common_cfg,
mode_cfg,
ensemble_seed,
)
fn = compose(fns)
d["ensemble_index"] = i
return fn(d)
no_templates = True
if("template_aatype" in tensors):
no_templates = tensors["template_aatype"].shape[0] == 0
nonensembled = nonensembled_transform_fns(
common_cfg,
mode_cfg,
)
tensors = compose(nonensembled)(tensors)
if("no_recycling_iters" in tensors):
num_recycling = int(tensors["no_recycling_iters"])
else:
num_recycling = common_cfg.max_recycling_iters
tensors = map_fn(
lambda x: wrap_ensemble_fn(tensors, x), torch.arange(num_recycling + 1)
)
return tensors
@data_transforms.curry1
def compose(x, fs):
for f in fs:
x = f(x)
return x
def map_fn(fun, x):
ensembles = [fun(elem) for elem in x]
features = ensembles[0].keys()
ensembled_dict = {}
for feat in features:
ensembled_dict[feat] = torch.stack(
[dict_i[feat] for dict_i in ensembles], dim=-1
)
return ensembled_dict
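# A minimal, hypothetical sketch (not part of the pipeline) of how compose() and
# map_fn() above fit together: compose() chains transforms left to right over a
# feature dict, and map_fn() stacks the per-iteration outputs along a new
# trailing dimension, mirroring process_tensors_from_config. The feature names
# and toy transforms below are invented for illustration only.
def _example_compose_and_map_fn():
    def add_one(feats):
        return {**feats, "x": feats["x"] + 1}
    def double(feats):
        return {**feats, "x": feats["x"] * 2}
    feats = {"x": torch.arange(4, dtype=torch.float32)}
    transformed = compose([add_one, double])(feats)  # x -> (x + 1) * 2
    stacked = map_fn(
        lambda i: {**transformed, "iteration": i.to(torch.float32)},
        torch.arange(3),
    )
    # stacked["x"].shape == (4, 3); stacked["iteration"].shape == (3,)
    return stacked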
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
import torch
from fastfold.data import (
data_transforms,
data_transforms_multimer,
)
def nonensembled_transform_fns(common_cfg, mode_cfg):
"""Input pipeline data transformers that are not ensembled."""
transforms = [
data_transforms.cast_to_64bit_ints,
data_transforms_multimer.make_msa_profile,
data_transforms_multimer.create_target_feat,
data_transforms.make_atom14_masks,
]
if(common_cfg.use_templates):
transforms.extend([
data_transforms.make_pseudo_beta("template_"),
])
return transforms
def ensembled_transform_fns(common_cfg, mode_cfg, ensemble_seed):
"""Input pipeline data transformers that can be ensembled and averaged."""
transforms = []
pad_msa_clusters = mode_cfg.max_msa_clusters
max_msa_clusters = pad_msa_clusters
max_extra_msa = common_cfg.max_extra_msa
msa_seed = None
if(not common_cfg.resample_msa_in_recycling):
msa_seed = ensemble_seed
transforms.append(
data_transforms_multimer.sample_msa(
max_msa_clusters,
max_extra_msa,
seed=msa_seed,
)
)
if "masked_msa" in common_cfg:
# Masked MSA should come *before* MSA clustering so that
# the clustering and full MSA profile do not leak information about
# the masked locations and secret corrupted locations.
transforms.append(
data_transforms_multimer.make_masked_msa(
common_cfg.masked_msa,
mode_cfg.masked_msa_replace_fraction,
seed=(msa_seed + 1) if msa_seed else None,
)
)
transforms.append(data_transforms_multimer.nearest_neighbor_clusters())
transforms.append(data_transforms_multimer.create_msa_feat)
return transforms
def process_tensors_from_config(tensors, common_cfg, mode_cfg):
"""Based on the config, apply filters and transformations to the data."""
ensemble_seed = torch.Generator().seed()
def wrap_ensemble_fn(data, i):
"""Function to be mapped over the ensemble dimension."""
d = data.copy()
fns = ensembled_transform_fns(
common_cfg,
mode_cfg,
ensemble_seed,
)
fn = compose(fns)
d["ensemble_index"] = i
return fn(d)
no_templates = True
if("template_aatype" in tensors):
no_templates = tensors["template_aatype"].shape[0] == 0
nonensembled = nonensembled_transform_fns(
common_cfg,
mode_cfg,
)
tensors = compose(nonensembled)(tensors)
if("no_recycling_iters" in tensors):
num_recycling = int(tensors["no_recycling_iters"])
else:
num_recycling = common_cfg.max_recycling_iters
tensors = map_fn(
lambda x: wrap_ensemble_fn(tensors, x), torch.arange(num_recycling + 1)
)
return tensors
@data_transforms.curry1
def compose(x, fs):
for f in fs:
x = f(x)
return x
def map_fn(fun, x):
ensembles = [fun(elem) for elem in x]
features = ensembles[0].keys()
ensembled_dict = {}
for feat in features:
ensembled_dict[feat] = torch.stack(
[dict_i[feat] for dict_i in ensembles], dim=-1
)
return ensembled_dict
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parses the mmCIF file format."""
import collections
import dataclasses
import io
import json
import logging
import os
from typing import Any, Mapping, Optional, Sequence, Tuple
from Bio import PDB
from Bio.Data import SCOPData
import numpy as np
from fastfold.data.errors import MultipleChainsError
import fastfold.common.residue_constants as residue_constants
# Type aliases:
ChainId = str
PdbHeader = Mapping[str, Any]
PdbStructure = PDB.Structure.Structure
SeqRes = str
MmCIFDict = Mapping[str, Sequence[str]]
@dataclasses.dataclass(frozen=True)
class Monomer:
id: str
num: int
# Note - mmCIF format provides no guarantees on the type of author-assigned
# sequence numbers. They need not be integers.
@dataclasses.dataclass(frozen=True)
class AtomSite:
residue_name: str
author_chain_id: str
mmcif_chain_id: str
author_seq_num: str
mmcif_seq_num: int
insertion_code: str
hetatm_atom: str
model_num: int
# Used to map SEQRES index to a residue in the structure.
@dataclasses.dataclass(frozen=True)
class ResiduePosition:
chain_id: str
residue_number: int
insertion_code: str
@dataclasses.dataclass(frozen=True)
class ResidueAtPosition:
position: Optional[ResiduePosition]
name: str
is_missing: bool
hetflag: str
@dataclasses.dataclass(frozen=True)
class MmcifObject:
"""Representation of a parsed mmCIF file.
Contains:
file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
files being processed.
header: Biopython header.
structure: Biopython structure.
chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g.
{'A': 'ABCDEFG'}
seqres_to_structure: Dict; for each chain_id contains a mapping between
SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition,
1: ResidueAtPosition,
...}}
raw_string: The raw string used to construct the MmcifObject.
"""
file_id: str
header: PdbHeader
structure: PdbStructure
chain_to_seqres: Mapping[ChainId, SeqRes]
seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]]
raw_string: Any
@dataclasses.dataclass(frozen=True)
class ParsingResult:
"""Returned by the parse function.
Contains:
mmcif_object: A MmcifObject, may be None if no chain could be successfully
parsed.
errors: A dict mapping (file_id, chain_id) to any exception generated.
"""
mmcif_object: Optional[MmcifObject]
errors: Mapping[Tuple[str, str], Any]
class ParseError(Exception):
"""An error indicating that an mmCIF file could not be parsed."""
def mmcif_loop_to_list(
prefix: str, parsed_info: MmCIFDict
) -> Sequence[Mapping[str, str]]:
"""Extracts loop associated with a prefix from mmCIF data as a list.
Reference for loop_ in mmCIF:
http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html
Args:
prefix: Prefix shared by each of the data items in the loop.
e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
_entity_poly_seq.mon_id. Should include the trailing period.
parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
parser.
Returns:
Returns a list of dicts; each dict represents 1 entry from an mmCIF loop.
"""
cols = []
data = []
for key, value in parsed_info.items():
if key.startswith(prefix):
cols.append(key)
data.append(value)
assert all([len(xs) == len(data[0]) for xs in data]), (
"mmCIF error: Not all loops are the same length: %s" % cols
)
return [dict(zip(cols, xs)) for xs in zip(*data)]
def mmcif_loop_to_dict(
prefix: str,
index: str,
parsed_info: MmCIFDict,
) -> Mapping[str, Mapping[str, str]]:
"""Extracts loop associated with a prefix from mmCIF data as a dictionary.
Args:
prefix: Prefix shared by each of the data items in the loop.
e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
_entity_poly_seq.mon_id. Should include the trailing period.
index: Which item of loop data should serve as the key.
parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
parser.
Returns:
Returns a dict of dicts; each dict represents 1 entry from an mmCIF loop,
indexed by the index column.
"""
entries = mmcif_loop_to_list(prefix, parsed_info)
return {entry[index]: entry for entry in entries}
def parse(
*, file_id: str, mmcif_string: str, catch_all_errors: bool = True
) -> ParsingResult:
"""Entry point, parses an mmcif_string.
Args:
file_id: A string identifier for this file. Should be unique within the
collection of files being processed.
mmcif_string: Contents of an mmCIF file.
catch_all_errors: If True, all exceptions are caught and error messages are
returned as part of the ParsingResult. If False exceptions will be allowed
to propagate.
Returns:
A ParsingResult.
"""
errors = {}
try:
parser = PDB.MMCIFParser(QUIET=True)
handle = io.StringIO(mmcif_string)
full_structure = parser.get_structure("", handle)
first_model_structure = _get_first_model(full_structure)
# Extract the _mmcif_dict from the parser, which contains useful fields not
# reflected in the Biopython structure.
parsed_info = parser._mmcif_dict # pylint:disable=protected-access
# Ensure all values are lists, even if singletons.
for key, value in parsed_info.items():
if not isinstance(value, list):
parsed_info[key] = [value]
header = _get_header(parsed_info)
# Determine the protein chains, and their start numbers according to the
# internal mmCIF numbering scheme (likely but not guaranteed to be 1).
valid_chains = _get_protein_chains(parsed_info=parsed_info)
if not valid_chains:
return ParsingResult(
None, {(file_id, ""): "No protein chains found in this file."}
)
seq_start_num = {
chain_id: min([monomer.num for monomer in seq])
for chain_id, seq in valid_chains.items()
}
# Loop over the atoms for which we have coordinates. Populate two mappings:
        # -mmcif_to_author_chain_id (maps internal mmCIF chain ids to chain ids
        #  used by the authors / Biopython).
# -seq_to_structure_mappings (maps idx into sequence to ResidueAtPosition).
mmcif_to_author_chain_id = {}
seq_to_structure_mappings = {}
for atom in _get_atom_site_list(parsed_info):
if atom.model_num != "1":
# We only process the first model at the moment.
continue
mmcif_to_author_chain_id[atom.mmcif_chain_id] = atom.author_chain_id
if atom.mmcif_chain_id in valid_chains:
hetflag = " "
if atom.hetatm_atom == "HETATM":
# Water atoms are assigned a special hetflag of W in Biopython. We
# need to do the same, so that this hetflag can be used to fetch
# a residue from the Biopython structure by id.
if atom.residue_name in ("HOH", "WAT"):
hetflag = "W"
else:
hetflag = "H_" + atom.residue_name
insertion_code = atom.insertion_code
if not _is_set(atom.insertion_code):
insertion_code = " "
position = ResiduePosition(
chain_id=atom.author_chain_id,
residue_number=int(atom.author_seq_num),
insertion_code=insertion_code,
)
seq_idx = (
int(atom.mmcif_seq_num) - seq_start_num[atom.mmcif_chain_id]
)
current = seq_to_structure_mappings.get(
atom.author_chain_id, {}
)
current[seq_idx] = ResidueAtPosition(
position=position,
name=atom.residue_name,
is_missing=False,
hetflag=hetflag,
)
seq_to_structure_mappings[atom.author_chain_id] = current
# Add missing residue information to seq_to_structure_mappings.
for chain_id, seq_info in valid_chains.items():
author_chain = mmcif_to_author_chain_id[chain_id]
current_mapping = seq_to_structure_mappings[author_chain]
for idx, monomer in enumerate(seq_info):
if idx not in current_mapping:
current_mapping[idx] = ResidueAtPosition(
position=None,
name=monomer.id,
is_missing=True,
hetflag=" ",
)
author_chain_to_sequence = {}
for chain_id, seq_info in valid_chains.items():
author_chain = mmcif_to_author_chain_id[chain_id]
seq = []
for monomer in seq_info:
code = SCOPData.protein_letters_3to1.get(monomer.id, "X")
seq.append(code if len(code) == 1 else "X")
seq = "".join(seq)
author_chain_to_sequence[author_chain] = seq
mmcif_object = MmcifObject(
file_id=file_id,
header=header,
structure=first_model_structure,
chain_to_seqres=author_chain_to_sequence,
seqres_to_structure=seq_to_structure_mappings,
raw_string=parsed_info,
)
return ParsingResult(mmcif_object=mmcif_object, errors=errors)
except Exception as e: # pylint:disable=broad-except
errors[(file_id, "")] = e
if not catch_all_errors:
raise
return ParsingResult(mmcif_object=None, errors=errors)
def _get_first_model(structure: PdbStructure) -> PdbStructure:
"""Returns the first model in a Biopython structure."""
return next(structure.get_models())
_MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDE = 21
def get_release_date(parsed_info: MmCIFDict) -> str:
"""Returns the oldest revision date."""
revision_dates = parsed_info["_pdbx_audit_revision_history.revision_date"]
return min(revision_dates)
def _get_header(parsed_info: MmCIFDict) -> PdbHeader:
"""Returns a basic header containing method, release date and resolution."""
header = {}
experiments = mmcif_loop_to_list("_exptl.", parsed_info)
header["structure_method"] = ",".join(
[experiment["_exptl.method"].lower() for experiment in experiments]
)
# Note: The release_date here corresponds to the oldest revision. We prefer to
# use this for dataset filtering over the deposition_date.
if "_pdbx_audit_revision_history.revision_date" in parsed_info:
header["release_date"] = get_release_date(parsed_info)
else:
logging.warning(
"Could not determine release_date: %s", parsed_info["_entry.id"]
)
header["resolution"] = 0.00
for res_key in (
"_refine.ls_d_res_high",
"_em_3d_reconstruction.resolution",
"_reflns.d_resolution_high",
):
if res_key in parsed_info:
try:
raw_resolution = parsed_info[res_key][0]
header["resolution"] = float(raw_resolution)
except ValueError:
logging.info(
"Invalid resolution format: %s", parsed_info[res_key]
)
return header
def _get_atom_site_list(parsed_info: MmCIFDict) -> Sequence[AtomSite]:
"""Returns list of atom sites; contains data not present in the structure."""
return [
AtomSite(*site)
for site in zip( # pylint:disable=g-complex-comprehension
parsed_info["_atom_site.label_comp_id"],
parsed_info["_atom_site.auth_asym_id"],
parsed_info["_atom_site.label_asym_id"],
parsed_info["_atom_site.auth_seq_id"],
parsed_info["_atom_site.label_seq_id"],
parsed_info["_atom_site.pdbx_PDB_ins_code"],
parsed_info["_atom_site.group_PDB"],
parsed_info["_atom_site.pdbx_PDB_model_num"],
)
]
def _get_protein_chains(
*, parsed_info: Mapping[str, Any]
) -> Mapping[ChainId, Sequence[Monomer]]:
"""Extracts polymer information for protein chains only.
Args:
parsed_info: _mmcif_dict produced by the Biopython parser.
Returns:
A dict mapping mmcif chain id to a list of Monomers.
"""
# Get polymer information for each entity in the structure.
entity_poly_seqs = mmcif_loop_to_list("_entity_poly_seq.", parsed_info)
polymers = collections.defaultdict(list)
for entity_poly_seq in entity_poly_seqs:
polymers[entity_poly_seq["_entity_poly_seq.entity_id"]].append(
Monomer(
id=entity_poly_seq["_entity_poly_seq.mon_id"],
num=int(entity_poly_seq["_entity_poly_seq.num"]),
)
)
# Get chemical compositions. Will allow us to identify which of these polymers
# are proteins.
chem_comps = mmcif_loop_to_dict("_chem_comp.", "_chem_comp.id", parsed_info)
# Get chains information for each entity. Necessary so that we can return a
# dict keyed on chain id rather than entity.
struct_asyms = mmcif_loop_to_list("_struct_asym.", parsed_info)
entity_to_mmcif_chains = collections.defaultdict(list)
for struct_asym in struct_asyms:
chain_id = struct_asym["_struct_asym.id"]
entity_id = struct_asym["_struct_asym.entity_id"]
entity_to_mmcif_chains[entity_id].append(chain_id)
# Identify and return the valid protein chains.
valid_chains = {}
for entity_id, seq_info in polymers.items():
chain_ids = entity_to_mmcif_chains[entity_id]
# Reject polymers without any peptide-like components, such as DNA/RNA.
if any(
[
"peptide" in chem_comps[monomer.id]["_chem_comp.type"]
for monomer in seq_info
]
):
for chain_id in chain_ids:
valid_chains[chain_id] = seq_info
return valid_chains
def _is_set(data: str) -> bool:
"""Returns False if data is a special mmCIF character indicating 'unset'."""
return data not in (".", "?")
def get_atom_coords(
mmcif_object: MmcifObject,
chain_id: str,
_zero_center_positions: bool = True
) -> Tuple[np.ndarray, np.ndarray]:
# Locate the right chain
chains = list(mmcif_object.structure.get_chains())
relevant_chains = [c for c in chains if c.id == chain_id]
if len(relevant_chains) != 1:
raise MultipleChainsError(
f"Expected exactly one chain in structure with id {chain_id}."
)
chain = relevant_chains[0]
# Extract the coordinates
num_res = len(mmcif_object.chain_to_seqres[chain_id])
all_atom_positions = np.zeros(
[num_res, residue_constants.atom_type_num, 3], dtype=np.float32
)
all_atom_mask = np.zeros(
[num_res, residue_constants.atom_type_num], dtype=np.float32
)
for res_index in range(num_res):
pos = np.zeros([residue_constants.atom_type_num, 3], dtype=np.float32)
mask = np.zeros([residue_constants.atom_type_num], dtype=np.float32)
res_at_position = mmcif_object.seqres_to_structure[chain_id][res_index]
if not res_at_position.is_missing:
res = chain[
(
res_at_position.hetflag,
res_at_position.position.residue_number,
res_at_position.position.insertion_code,
)
]
for atom in res.get_atoms():
atom_name = atom.get_name()
x, y, z = atom.get_coord()
if atom_name in residue_constants.atom_order.keys():
pos[residue_constants.atom_order[atom_name]] = [x, y, z]
mask[residue_constants.atom_order[atom_name]] = 1.0
elif atom_name.upper() == "SE" and res.get_resname() == "MSE":
# Put the coords of the selenium atom in the sulphur column
pos[residue_constants.atom_order["SD"]] = [x, y, z]
mask[residue_constants.atom_order["SD"]] = 1.0
all_atom_positions[res_index] = pos
all_atom_mask[res_index] = mask
if _zero_center_positions:
binary_mask = all_atom_mask.astype(bool)
translation_vec = all_atom_positions[binary_mask].mean(axis=0)
all_atom_positions[binary_mask] -= translation_vec
return all_atom_positions, all_atom_mask
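# A minimal usage sketch for the parser above. "structure.cif" and chain "A" are
# assumptions made for illustration; neither is defined by this module.
def _example_parse_mmcif_and_extract_coords():
    with open("structure.cif") as f:
        mmcif_string = f.read()
    result = parse(file_id="structure", mmcif_string=mmcif_string)
    if result.mmcif_object is None:
        # Errors are keyed by (file_id, chain_id).
        raise ValueError(f"Failed to parse mmCIF: {result.errors}")
    seqres = result.mmcif_object.chain_to_seqres["A"]
    coords, mask = get_atom_coords(result.mmcif_object, chain_id="A")
    # coords: [num_res, atom_type_num, 3]; mask: [num_res, atom_type_num]
    return seqres, coords, mask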
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for extracting identifiers from MSA sequence descriptions."""
import dataclasses
import re
from typing import Optional
# Sequences coming from UniProtKB database come in the
# `db|UniqueIdentifier|EntryName` format, e.g. `tr|A0A146SKV9|A0A146SKV9_FUNHE`
# or `sp|P0C2L1|A3X1_LOXLA` (for TREMBL/Swiss-Prot respectively).
_UNIPROT_PATTERN = re.compile(
r"""
^
# UniProtKB/TrEMBL or UniProtKB/Swiss-Prot
(?:tr|sp)
\|
# A primary accession number of the UniProtKB entry.
(?P<AccessionIdentifier>[A-Za-z0-9]{6,10})
# Occasionally there is a _0 or _1 isoform suffix, which we ignore.
(?:_\d)?
\|
# TREMBL repeats the accession ID here. Swiss-Prot has a mnemonic
# protein ID code.
(?:[A-Za-z0-9]+)
_
# A mnemonic species identification code.
(?P<SpeciesIdentifier>([A-Za-z0-9]){1,5})
# Small BFD uses a final value after an underscore, which we ignore.
(?:_\d+)?
$
""",
re.VERBOSE)
@dataclasses.dataclass(frozen=True)
class Identifiers:
species_id: str = ''
def _parse_sequence_identifier(msa_sequence_identifier: str) -> Identifiers:
"""Gets accession id and species from an msa sequence identifier.
The sequence identifier has the format specified by
_UNIPROT_TREMBL_ENTRY_NAME_PATTERN or _UNIPROT_SWISSPROT_ENTRY_NAME_PATTERN.
An example of a sequence identifier: `tr|A0A146SKV9|A0A146SKV9_FUNHE`
Args:
msa_sequence_identifier: a sequence identifier.
Returns:
      An `Identifiers` instance with the species_id, which can be empty if no
      identifier was found.
"""
matches = re.search(_UNIPROT_PATTERN, msa_sequence_identifier.strip())
if matches:
return Identifiers(
species_id=matches.group('SpeciesIdentifier')
)
return Identifiers()
def _extract_sequence_identifier(description: str) -> Optional[str]:
"""Extracts sequence identifier from description. Returns None if no match."""
split_description = description.split()
if split_description:
return split_description[0].partition('/')[0]
else:
return None
def get_identifiers(description: str) -> Identifiers:
"""Computes extra MSA features from the description."""
sequence_identifier = _extract_sequence_identifier(description)
if sequence_identifier is None:
return Identifiers()
else:
return _parse_sequence_identifier(sequence_identifier)
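# A small, hypothetical sketch of get_identifiers(). The description strings are
# invented examples in the UniProtKB format documented above.
def _example_get_identifiers():
    ids = get_identifiers("tr|A0A146SKV9|A0A146SKV9_FUNHE/11-90 some free text")
    # ids.species_id == "FUNHE"
    unmatched = get_identifiers("some_unrecognized_description")
    # unmatched.species_id == ""
    return ids, unmatched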
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pairing logic for multimer data pipeline."""
import collections
import functools
import string
from typing import Any, Dict, Iterable, List, Sequence, Mapping
import numpy as np
import pandas as pd
import scipy.linalg
from fastfold.common import residue_constants
# TODO: This stuff should probably also be in a config
MSA_GAP_IDX = residue_constants.restypes_with_x_and_gap.index('-')
SEQUENCE_GAP_CUTOFF = 0.5
SEQUENCE_SIMILARITY_CUTOFF = 0.9
MSA_PAD_VALUES = {'msa_all_seq': MSA_GAP_IDX,
'msa_mask_all_seq': 1,
'deletion_matrix_all_seq': 0,
'deletion_matrix_int_all_seq': 0,
'msa': MSA_GAP_IDX,
'msa_mask': 1,
'deletion_matrix': 0,
'deletion_matrix_int': 0}
MSA_FEATURES = ('msa', 'msa_mask', 'deletion_matrix', 'deletion_matrix_int')
SEQ_FEATURES = ('residue_index', 'aatype', 'all_atom_positions',
'all_atom_mask', 'seq_mask', 'between_segment_residues',
'has_alt_locations', 'has_hetatoms', 'asym_id', 'entity_id',
'sym_id', 'entity_mask', 'deletion_mean',
'prediction_atom_mask',
'literature_positions', 'atom_indices_to_group_indices',
'rigid_group_default_frame')
TEMPLATE_FEATURES = ('template_aatype', 'template_all_atom_positions',
'template_all_atom_mask')
CHAIN_FEATURES = ('num_alignments', 'seq_length')
def create_paired_features(
chains: Iterable[Mapping[str, np.ndarray]],
) -> List[Mapping[str, np.ndarray]]:
"""Returns the original chains with paired NUM_SEQ features.
Args:
chains: A list of feature dictionaries for each chain.
Returns:
A list of feature dictionaries with sequence features including only
rows to be paired.
"""
chains = list(chains)
chain_keys = chains[0].keys()
if len(chains) < 2:
return chains
else:
updated_chains = []
paired_chains_to_paired_row_indices = pair_sequences(chains)
paired_rows = reorder_paired_rows(
paired_chains_to_paired_row_indices)
for chain_num, chain in enumerate(chains):
new_chain = {k: v for k, v in chain.items() if '_all_seq' not in k}
for feature_name in chain_keys:
if feature_name.endswith('_all_seq'):
feats_padded = pad_features(chain[feature_name], feature_name)
new_chain[feature_name] = feats_padded[paired_rows[:, chain_num]]
new_chain['num_alignments_all_seq'] = np.asarray(
len(paired_rows[:, chain_num]))
updated_chains.append(new_chain)
return updated_chains
def pad_features(feature: np.ndarray, feature_name: str) -> np.ndarray:
"""Add a 'padding' row at the end of the features list.
The padding row will be selected as a 'paired' row in the case of partial
alignment - for the chain that doesn't have paired alignment.
Args:
feature: The feature to be padded.
feature_name: The name of the feature to be padded.
Returns:
The feature with an additional padding row.
"""
assert feature.dtype != np.dtype(np.string_)
if feature_name in ('msa_all_seq', 'msa_mask_all_seq',
'deletion_matrix_all_seq', 'deletion_matrix_int_all_seq'):
num_res = feature.shape[1]
padding = MSA_PAD_VALUES[feature_name] * np.ones([1, num_res],
feature.dtype)
elif feature_name == 'msa_species_identifiers_all_seq':
padding = [b'']
else:
return feature
feats_padded = np.concatenate([feature, padding], axis=0)
return feats_padded
def _make_msa_df(chain_features: Mapping[str, np.ndarray]) -> pd.DataFrame:
"""Makes dataframe with msa features needed for msa pairing."""
chain_msa = chain_features['msa_all_seq']
query_seq = chain_msa[0]
per_seq_similarity = np.sum(
query_seq[None] == chain_msa, axis=-1) / float(len(query_seq))
per_seq_gap = np.sum(chain_msa == 21, axis=-1) / float(len(query_seq))
msa_df = pd.DataFrame({
'msa_species_identifiers':
chain_features['msa_species_identifiers_all_seq'],
'msa_row':
np.arange(len(
chain_features['msa_species_identifiers_all_seq'])),
'msa_similarity': per_seq_similarity,
'gap': per_seq_gap
})
return msa_df
def _create_species_dict(msa_df: pd.DataFrame) -> Dict[bytes, pd.DataFrame]:
"""Creates mapping from species to msa dataframe of that species."""
species_lookup = {}
for species, species_df in msa_df.groupby('msa_species_identifiers'):
species_lookup[species] = species_df
return species_lookup
def _match_rows_by_sequence_similarity(this_species_msa_dfs: List[pd.DataFrame]
) -> List[List[int]]:
"""Finds MSA sequence pairings across chains based on sequence similarity.
Each chain's MSA sequences are first sorted by their sequence similarity to
their respective target sequence. The sequences are then paired, starting
from the sequences most similar to their target sequence.
Args:
this_species_msa_dfs: a list of dataframes containing MSA features for
sequences for a specific species.
Returns:
A list of lists, each containing M indices corresponding to paired MSA rows,
where M is the number of chains.
"""
all_paired_msa_rows = []
num_seqs = [len(species_df) for species_df in this_species_msa_dfs
if species_df is not None]
take_num_seqs = np.min(num_seqs)
sort_by_similarity = (
lambda x: x.sort_values('msa_similarity', axis=0, ascending=False))
for species_df in this_species_msa_dfs:
if species_df is not None:
species_df_sorted = sort_by_similarity(species_df)
msa_rows = species_df_sorted.msa_row.iloc[:take_num_seqs].values
else:
msa_rows = [-1] * take_num_seqs # take the last 'padding' row
all_paired_msa_rows.append(msa_rows)
all_paired_msa_rows = list(np.array(all_paired_msa_rows).transpose())
return all_paired_msa_rows
def pair_sequences(
examples: List[Mapping[str, np.ndarray]],
) -> Dict[int, np.ndarray]:
"""Returns indices for paired MSA sequences across chains."""
num_examples = len(examples)
all_chain_species_dict = []
common_species = set()
for chain_features in examples:
msa_df = _make_msa_df(chain_features)
species_dict = _create_species_dict(msa_df)
all_chain_species_dict.append(species_dict)
common_species.update(set(species_dict))
common_species = sorted(common_species)
common_species.remove(b'') # Remove target sequence species.
all_paired_msa_rows = [np.zeros(len(examples), int)]
all_paired_msa_rows_dict = {k: [] for k in range(num_examples)}
all_paired_msa_rows_dict[num_examples] = [np.zeros(len(examples), int)]
for species in common_species:
if not species:
continue
this_species_msa_dfs = []
species_dfs_present = 0
for species_dict in all_chain_species_dict:
if species in species_dict:
this_species_msa_dfs.append(species_dict[species])
species_dfs_present += 1
else:
this_species_msa_dfs.append(None)
# Skip species that are present in only one chain.
if species_dfs_present <= 1:
continue
if np.any(
np.array([len(species_df) for species_df in
this_species_msa_dfs if
isinstance(species_df, pd.DataFrame)]) > 600):
continue
paired_msa_rows = _match_rows_by_sequence_similarity(this_species_msa_dfs)
all_paired_msa_rows.extend(paired_msa_rows)
all_paired_msa_rows_dict[species_dfs_present].extend(paired_msa_rows)
all_paired_msa_rows_dict = {
num_examples: np.array(paired_msa_rows) for
num_examples, paired_msa_rows in all_paired_msa_rows_dict.items()
}
return all_paired_msa_rows_dict
def reorder_paired_rows(all_paired_msa_rows_dict: Dict[int, np.ndarray]
) -> np.ndarray:
"""Creates a list of indices of paired MSA rows across chains.
Args:
all_paired_msa_rows_dict: a mapping from the number of paired chains to the
paired indices.
Returns:
a list of lists, each containing indices of paired MSA rows across chains.
The paired-index lists are ordered by:
      1) the number of chains in the paired alignment, i.e., all-chain pairings
will come first.
2) e-values
"""
all_paired_msa_rows = []
for num_pairings in sorted(all_paired_msa_rows_dict, reverse=True):
paired_rows = all_paired_msa_rows_dict[num_pairings]
paired_rows_product = abs(np.array([np.prod(rows) for rows in paired_rows]))
paired_rows_sort_index = np.argsort(paired_rows_product)
all_paired_msa_rows.extend(paired_rows[paired_rows_sort_index])
return np.array(all_paired_msa_rows)
def block_diag(*arrs: np.ndarray, pad_value: float = 0.0) -> np.ndarray:
"""Like scipy.linalg.block_diag but with an optional padding value."""
ones_arrs = [np.ones_like(x) for x in arrs]
off_diag_mask = 1.0 - scipy.linalg.block_diag(*ones_arrs)
diag = scipy.linalg.block_diag(*arrs)
diag += (off_diag_mask * pad_value).astype(diag.dtype)
return diag
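# A minimal, hypothetical sketch of block_diag() with a non-zero pad value: the
# off-diagonal entries are filled with pad_value instead of 0. The input arrays
# are arbitrary toy values.
def _example_block_diag():
    a = np.ones((1, 2), dtype=np.int64)
    b = 2 * np.ones((2, 1), dtype=np.int64)
    out = block_diag(a, b, pad_value=MSA_GAP_IDX)
    # out == [[ 1,  1, 21],
    #         [21, 21,  2],
    #         [21, 21,  2]]   (MSA_GAP_IDX is 21 in residue_constants)
    return out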
def _correct_post_merged_feats(
np_example: Mapping[str, np.ndarray],
np_chains_list: Sequence[Mapping[str, np.ndarray]],
pair_msa_sequences: bool
) -> Mapping[str, np.ndarray]:
"""Adds features that need to be computed/recomputed post merging."""
num_res = np_example['aatype'].shape[0]
np_example['seq_length'] = np.asarray(
[num_res] * num_res,
dtype=np.int32
)
np_example['num_alignments'] = np.asarray(
np_example['msa'].shape[0],
dtype=np.int32
)
if not pair_msa_sequences:
# Generate a bias that is 1 for the first row of every block in the
# block diagonal MSA - i.e. make sure the cluster stack always includes
# the query sequences for each chain (since the first row is the query
# sequence).
cluster_bias_masks = []
for chain in np_chains_list:
mask = np.zeros(chain['msa'].shape[0])
mask[0] = 1
cluster_bias_masks.append(mask)
np_example['cluster_bias_mask'] = np.concatenate(cluster_bias_masks)
# Initialize Bert mask with masked out off diagonals.
msa_masks = [
np.ones(x['msa'].shape, dtype=np.float32)
for x in np_chains_list
]
np_example['bert_mask'] = block_diag(
*msa_masks, pad_value=0
)
else:
np_example['cluster_bias_mask'] = np.zeros(np_example['msa'].shape[0])
np_example['cluster_bias_mask'][0] = 1
# Initialize Bert mask with masked out off diagonals.
msa_masks = [
np.ones(x['msa'].shape, dtype=np.float32) for
x in np_chains_list
]
msa_masks_all_seq = [
np.ones(x['msa_all_seq'].shape, dtype=np.float32) for
x in np_chains_list
]
msa_mask_block_diag = block_diag(
*msa_masks, pad_value=0
)
msa_mask_all_seq = np.concatenate(msa_masks_all_seq, axis=1)
np_example['bert_mask'] = np.concatenate(
[msa_mask_all_seq, msa_mask_block_diag],
axis=0
)
return np_example
def _pad_templates(chains: Sequence[Mapping[str, np.ndarray]],
max_templates: int) -> Sequence[Mapping[str, np.ndarray]]:
"""For each chain pad the number of templates to a fixed size.
Args:
chains: A list of protein chains.
max_templates: Each chain will be padded to have this many templates.
Returns:
The list of chains, updated to have template features padded to
max_templates.
"""
for chain in chains:
for k, v in chain.items():
if k in TEMPLATE_FEATURES:
padding = np.zeros_like(v.shape)
padding[0] = max_templates - v.shape[0]
padding = [(0, p) for p in padding]
chain[k] = np.pad(v, padding, mode='constant')
return chains
def _merge_features_from_multiple_chains(
chains: Sequence[Mapping[str, np.ndarray]],
pair_msa_sequences: bool) -> Mapping[str, np.ndarray]:
"""Merge features from multiple chains.
Args:
chains: A list of feature dictionaries that we want to merge.
pair_msa_sequences: Whether to concatenate MSA features along the
num_res dimension (if True), or to block diagonalize them (if False).
Returns:
A feature dictionary for the merged example.
"""
merged_example = {}
for feature_name in chains[0]:
feats = [x[feature_name] for x in chains]
feature_name_split = feature_name.split('_all_seq')[0]
if feature_name_split in MSA_FEATURES:
if pair_msa_sequences or '_all_seq' in feature_name:
merged_example[feature_name] = np.concatenate(feats, axis=1)
else:
merged_example[feature_name] = block_diag(
*feats, pad_value=MSA_PAD_VALUES[feature_name])
elif feature_name_split in SEQ_FEATURES:
merged_example[feature_name] = np.concatenate(feats, axis=0)
elif feature_name_split in TEMPLATE_FEATURES:
merged_example[feature_name] = np.concatenate(feats, axis=1)
elif feature_name_split in CHAIN_FEATURES:
merged_example[feature_name] = np.sum(x for x in feats).astype(np.int32)
else:
merged_example[feature_name] = feats[0]
return merged_example
def _merge_homomers_dense_msa(
chains: Iterable[Mapping[str, np.ndarray]]) -> Sequence[Mapping[str, np.ndarray]]:
"""Merge all identical chains, making the resulting MSA dense.
Args:
chains: An iterable of features for each chain.
Returns:
A list of feature dictionaries. All features with the same entity_id
will be merged - MSA features will be concatenated along the num_res
dimension - making them dense.
"""
entity_chains = collections.defaultdict(list)
for chain in chains:
entity_id = chain['entity_id'][0]
entity_chains[entity_id].append(chain)
grouped_chains = []
for entity_id in sorted(entity_chains):
chains = entity_chains[entity_id]
grouped_chains.append(chains)
chains = [
_merge_features_from_multiple_chains(chains, pair_msa_sequences=True)
for chains in grouped_chains]
return chains
def _concatenate_paired_and_unpaired_features(
example: Mapping[str, np.ndarray]) -> Mapping[str, np.ndarray]:
"""Merges paired and block-diagonalised features."""
features = MSA_FEATURES
for feature_name in features:
if feature_name in example:
feat = example[feature_name]
feat_all_seq = example[feature_name + '_all_seq']
merged_feat = np.concatenate([feat_all_seq, feat], axis=0)
example[feature_name] = merged_feat
example['num_alignments'] = np.array(example['msa'].shape[0],
dtype=np.int32)
return example
def merge_chain_features(np_chains_list: List[Mapping[str, np.ndarray]],
pair_msa_sequences: bool,
max_templates: int) -> Mapping[str, np.ndarray]:
"""Merges features for multiple chains to single FeatureDict.
Args:
np_chains_list: List of FeatureDicts for each chain.
pair_msa_sequences: Whether to merge paired MSAs.
max_templates: The maximum number of templates to include.
Returns:
Single FeatureDict for entire complex.
"""
np_chains_list = _pad_templates(
np_chains_list, max_templates=max_templates)
np_chains_list = _merge_homomers_dense_msa(np_chains_list)
# Unpaired MSA features will be always block-diagonalised; paired MSA
# features will be concatenated.
np_example = _merge_features_from_multiple_chains(
np_chains_list, pair_msa_sequences=False)
if pair_msa_sequences:
np_example = _concatenate_paired_and_unpaired_features(np_example)
np_example = _correct_post_merged_feats(
np_example=np_example,
np_chains_list=np_chains_list,
pair_msa_sequences=pair_msa_sequences)
return np_example
def deduplicate_unpaired_sequences(
np_chains: List[Mapping[str, np.ndarray]]) -> List[Mapping[str, np.ndarray]]:
"""Removes unpaired sequences which duplicate a paired sequence."""
feature_names = np_chains[0].keys()
msa_features = MSA_FEATURES
for chain in np_chains:
# Convert the msa_all_seq numpy array to a tuple for hashing.
sequence_set = set(tuple(s) for s in chain['msa_all_seq'])
keep_rows = []
# Go through unpaired MSA seqs and remove any rows that correspond to the
# sequences that are already present in the paired MSA.
for row_num, seq in enumerate(chain['msa']):
if tuple(seq) not in sequence_set:
keep_rows.append(row_num)
for feature_name in feature_names:
if feature_name in msa_features:
chain[feature_name] = chain[feature_name][keep_rows]
chain['num_alignments'] = np.array(chain['msa'].shape[0], dtype=np.int32)
return np_chains
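# A small, hypothetical sketch of two helpers above, using synthetic inputs
# rather than real chain features: pad_features() appends a gap-valued row to
# paired MSA features, and reorder_paired_rows() orders paired row indices by
# the product of the indices within each pairing group.
def _example_pad_and_reorder():
    msa_all_seq = np.zeros((3, 5), dtype=np.int64)
    padded = pad_features(msa_all_seq, "msa_all_seq")
    # padded.shape == (4, 5); the appended row is all MSA_GAP_IDX.
    paired = {2: np.array([[0, 0], [2, 3], [1, 1]])}
    order = reorder_paired_rows(paired)
    # order == [[0, 0], [1, 1], [2, 3]] (sorted by product of row indices)
    return padded, order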
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions for parsing various file formats."""
import collections
import dataclasses
import itertools
import re
import string
from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Set
DeletionMatrix = Sequence[Sequence[int]]
@dataclasses.dataclass(frozen=True)
class Msa:
"""Class representing a parsed MSA file"""
sequences: Sequence[str]
deletion_matrix: DeletionMatrix
descriptions: Optional[Sequence[str]]
def __post_init__(self):
if(not (
len(self.sequences) ==
len(self.deletion_matrix) ==
len(self.descriptions)
)):
raise ValueError(
"All fields for an MSA must have the same length"
)
def __len__(self):
return len(self.sequences)
def truncate(self, max_seqs: int):
return Msa(
sequences=self.sequences[:max_seqs],
deletion_matrix=self.deletion_matrix[:max_seqs],
descriptions=self.descriptions[:max_seqs],
)
@dataclasses.dataclass(frozen=True)
class TemplateHit:
"""Class representing a template hit."""
index: int
name: str
aligned_cols: int
sum_probs: Optional[float]
query: str
hit_sequence: str
indices_query: List[int]
indices_hit: List[int]
def parse_fasta(fasta_string: str) -> Tuple[Sequence[str], Sequence[str]]:
"""Parses FASTA string and returns list of strings with amino-acid sequences.
Arguments:
fasta_string: The string contents of a FASTA file.
Returns:
A tuple of two lists:
* A list of sequences.
* A list of sequence descriptions taken from the comment lines. In the
same order as the sequences.
"""
sequences = []
descriptions = []
index = -1
for line in fasta_string.splitlines():
line = line.strip()
if line.startswith(">"):
index += 1
descriptions.append(line[1:]) # Remove the '>' at the beginning.
sequences.append("")
continue
elif not line:
continue # Skip blank lines.
sequences[index] += line
return sequences, descriptions
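# A minimal, hypothetical sketch of parse_fasta(): sequences may span multiple
# lines and blank lines are skipped. The FASTA content below is invented.
def _example_parse_fasta():
    fasta = ">query description\nMKT\nAVL\n\n>hit_1\nMKAV-L\n"
    seqs, descs = parse_fasta(fasta)
    # seqs == ["MKTAVL", "MKAV-L"]
    # descs == ["query description", "hit_1"]
    return seqs, descs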
def parse_stockholm(stockholm_string: str) -> Msa:
"""Parses sequences and deletion matrix from stockholm format alignment.
Args:
stockholm_string: The string contents of a stockholm file. The first
sequence in the file should be the query sequence.
Returns:
      An `Msa` object containing:
* A list of sequences that have been aligned to the query. These
might contain duplicates.
* The deletion matrix for the alignment as a list of lists. The element
at `deletion_matrix[i][j]` is the number of residues deleted from
the aligned sequence i at residue position j.
* The names of the targets matched, including the jackhmmer subsequence
suffix.
"""
name_to_sequence = collections.OrderedDict()
for line in stockholm_string.splitlines():
line = line.strip()
if not line or line.startswith(("#", "//")):
continue
name, sequence = line.split()
if name not in name_to_sequence:
name_to_sequence[name] = ""
name_to_sequence[name] += sequence
msa = []
deletion_matrix = []
query = ""
keep_columns = []
for seq_index, sequence in enumerate(name_to_sequence.values()):
if seq_index == 0:
# Gather the columns with gaps from the query
query = sequence
keep_columns = [i for i, res in enumerate(query) if res != "-"]
# Remove the columns with gaps in the query from all sequences.
aligned_sequence = "".join([sequence[c] for c in keep_columns])
msa.append(aligned_sequence)
# Count the number of deletions w.r.t. query.
deletion_vec = []
deletion_count = 0
for seq_res, query_res in zip(sequence, query):
if seq_res != "-" or query_res != "-":
if query_res == "-":
deletion_count += 1
else:
deletion_vec.append(deletion_count)
deletion_count = 0
deletion_matrix.append(deletion_vec)
return Msa(
sequences=msa,
deletion_matrix=deletion_matrix,
descriptions=list(name_to_sequence.keys())
)
def parse_a3m(a3m_string: str) -> Msa:
"""Parses sequences and deletion matrix from a3m format alignment.
Args:
      a3m_string: The string contents of an a3m file. The first sequence in the
file should be the query sequence.
Returns:
      An `Msa` object containing:
* A list of sequences that have been aligned to the query. These
might contain duplicates.
* The deletion matrix for the alignment as a list of lists. The element
at `deletion_matrix[i][j]` is the number of residues deleted from
the aligned sequence i at residue position j.
"""
sequences, descriptions = parse_fasta(a3m_string)
deletion_matrix = []
for msa_sequence in sequences:
deletion_vec = []
deletion_count = 0
for j in msa_sequence:
if j.islower():
deletion_count += 1
else:
deletion_vec.append(deletion_count)
deletion_count = 0
deletion_matrix.append(deletion_vec)
# Make the MSA matrix out of aligned (deletion-free) sequences.
deletion_table = str.maketrans("", "", string.ascii_lowercase)
aligned_sequences = [s.translate(deletion_table) for s in sequences]
return Msa(
sequences=aligned_sequences,
deletion_matrix=deletion_matrix,
descriptions=descriptions
)
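# A minimal, hypothetical sketch of parse_a3m(): lowercase letters are stripped
# from the returned sequences and counted in the deletion matrix. The a3m
# content below is invented.
def _example_parse_a3m():
    a3m = ">query\nMKV\n>hit_1\nMaKV\n"
    msa = parse_a3m(a3m)
    # msa.sequences == ["MKV", "MKV"]
    # msa.deletion_matrix == [[0, 0, 0], [0, 1, 0]]
    # msa.descriptions == ["query", "hit_1"]
    return msa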
def _convert_sto_seq_to_a3m(
query_non_gaps: Sequence[bool], sto_seq: str
) -> Iterable[str]:
for is_query_res_non_gap, sequence_res in zip(query_non_gaps, sto_seq):
if is_query_res_non_gap:
yield sequence_res
elif sequence_res != "-":
yield sequence_res.lower()
def convert_stockholm_to_a3m(
stockholm_format: str,
max_sequences: Optional[int] = None,
remove_first_row_gaps: bool = True,
) -> str:
"""Converts MSA in Stockholm format to the A3M format."""
descriptions = {}
sequences = {}
reached_max_sequences = False
for line in stockholm_format.splitlines():
reached_max_sequences = (
max_sequences and len(sequences) >= max_sequences
)
if line.strip() and not line.startswith(("#", "//")):
# Ignore blank lines, markup and end symbols - remainder are alignment
# sequence parts.
seqname, aligned_seq = line.split(maxsplit=1)
if seqname not in sequences:
if reached_max_sequences:
continue
sequences[seqname] = ""
sequences[seqname] += aligned_seq
for line in stockholm_format.splitlines():
if line[:4] == "#=GS":
# Description row - example format is:
# #=GS UniRef90_Q9H5Z4/4-78 DE [subseq from] cDNA: FLJ22755 ...
columns = line.split(maxsplit=3)
seqname, feature = columns[1:3]
value = columns[3] if len(columns) == 4 else ""
if feature != "DE":
continue
if reached_max_sequences and seqname not in sequences:
continue
descriptions[seqname] = value
if len(descriptions) == len(sequences):
break
# Convert sto format to a3m line by line
a3m_sequences = {}
if(remove_first_row_gaps):
# query_sequence is assumed to be the first sequence
query_sequence = next(iter(sequences.values()))
query_non_gaps = [res != "-" for res in query_sequence]
for seqname, sto_sequence in sequences.items():
# Dots are optional in a3m format and are commonly removed.
out_sequence = sto_sequence.replace('.', '')
if(remove_first_row_gaps):
out_sequence = ''.join(
_convert_sto_seq_to_a3m(query_non_gaps, out_sequence)
)
a3m_sequences[seqname] = out_sequence
fasta_chunks = (
f">{k} {descriptions.get(k, '')}\n{a3m_sequences[k]}"
for k in a3m_sequences
)
return "\n".join(fasta_chunks) + "\n" # Include terminating newline.
def _keep_line(line: str, seqnames: Set[str]) -> bool:
"""Function to decide which lines to keep."""
if not line.strip():
return True
if line.strip() == '//': # End tag
return True
if line.startswith('# STOCKHOLM'): # Start tag
return True
if line.startswith('#=GC RF'): # Reference Annotation Line
return True
if line[:4] == '#=GS': # Description lines - keep if sequence in list.
_, seqname, _ = line.split(maxsplit=2)
return seqname in seqnames
elif line.startswith('#'): # Other markup - filter out
return False
else: # Alignment data - keep if sequence in list.
seqname = line.partition(' ')[0]
return seqname in seqnames
def truncate_stockholm_msa(stockholm_msa_path: str, max_sequences: int) -> str:
"""Reads + truncates a Stockholm file while preventing excessive RAM usage."""
seqnames = set()
filtered_lines = []
with open(stockholm_msa_path) as f:
for line in f:
if line.strip() and not line.startswith(('#', '//')):
# Ignore blank lines, markup and end symbols - remainder are alignment
# sequence parts.
seqname = line.partition(' ')[0]
seqnames.add(seqname)
if len(seqnames) >= max_sequences:
break
f.seek(0)
for line in f:
if _keep_line(line, seqnames):
filtered_lines.append(line)
return ''.join(filtered_lines)
def remove_empty_columns_from_stockholm_msa(stockholm_msa: str) -> str:
"""Removes empty columns (dashes-only) from a Stockholm MSA."""
processed_lines = {}
unprocessed_lines = {}
for i, line in enumerate(stockholm_msa.splitlines()):
if line.startswith('#=GC RF'):
reference_annotation_i = i
reference_annotation_line = line
# Reached the end of this chunk of the alignment. Process chunk.
_, _, first_alignment = line.rpartition(' ')
mask = []
for j in range(len(first_alignment)):
for _, unprocessed_line in unprocessed_lines.items():
prefix, _, alignment = unprocessed_line.rpartition(' ')
if alignment[j] != '-':
mask.append(True)
break
else: # Every row contained a hyphen - empty column.
mask.append(False)
# Add reference annotation for processing with mask.
unprocessed_lines[reference_annotation_i] = reference_annotation_line
if not any(mask): # All columns were empty. Output empty lines for chunk.
for line_index in unprocessed_lines:
processed_lines[line_index] = ''
else:
for line_index, unprocessed_line in unprocessed_lines.items():
prefix, _, alignment = unprocessed_line.rpartition(' ')
masked_alignment = ''.join(itertools.compress(alignment, mask))
processed_lines[line_index] = f'{prefix} {masked_alignment}'
# Clear raw_alignments.
unprocessed_lines = {}
elif line.strip() and not line.startswith(('#', '//')):
unprocessed_lines[i] = line
else:
processed_lines[i] = line
return '\n'.join((processed_lines[i] for i in range(len(processed_lines))))
def deduplicate_stockholm_msa(stockholm_msa: str) -> str:
"""Remove duplicate sequences (ignoring insertions wrt query)."""
sequence_dict = collections.defaultdict(str)
# First we must extract all sequences from the MSA.
for line in stockholm_msa.splitlines():
# Only consider the alignments - ignore reference annotation, empty lines,
# descriptions or markup.
if line.strip() and not line.startswith(('#', '//')):
line = line.strip()
seqname, alignment = line.split()
sequence_dict[seqname] += alignment
seen_sequences = set()
seqnames = set()
# First alignment is the query.
query_align = next(iter(sequence_dict.values()))
mask = [c != '-' for c in query_align] # Mask is False for insertions.
for seqname, alignment in sequence_dict.items():
# Apply mask to remove all insertions from the string.
masked_alignment = ''.join(itertools.compress(alignment, mask))
if masked_alignment in seen_sequences:
continue
else:
seen_sequences.add(masked_alignment)
seqnames.add(seqname)
filtered_lines = []
for line in stockholm_msa.splitlines():
if _keep_line(line, seqnames):
filtered_lines.append(line)
return '\n'.join(filtered_lines) + '\n'
def _get_hhr_line_regex_groups(
regex_pattern: str, line: str
) -> Sequence[Optional[str]]:
match = re.match(regex_pattern, line)
if match is None:
raise RuntimeError(f"Could not parse query line {line}")
return match.groups()
def _update_hhr_residue_indices_list(
sequence: str, start_index: int, indices_list: List[int]
):
"""Computes the relative indices for each residue with respect to the original sequence."""
counter = start_index
for symbol in sequence:
if symbol == "-":
indices_list.append(-1)
else:
indices_list.append(counter)
counter += 1
def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit:
"""Parses the detailed HMM HMM comparison section for a single Hit.
This works on .hhr files generated from both HHBlits and HHSearch.
Args:
detailed_lines: A list of lines from a single comparison section between 2
        sequences (each of which has its own HMM).
Returns:
      A TemplateHit with the information from that detailed comparison section.
Raises:
RuntimeError: If a certain line cannot be processed
"""
# Parse first 2 lines.
number_of_hit = int(detailed_lines[0].split()[-1])
name_hit = detailed_lines[1][1:]
# Parse the summary line.
pattern = (
"Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t"
" ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)[\t "
"]*Template_Neff=(.*)"
)
match = re.match(pattern, detailed_lines[2])
if match is None:
raise RuntimeError(
"Could not parse section: %s. Expected this: \n%s to contain summary."
% (detailed_lines, detailed_lines[2])
)
(_, _, _, aligned_cols, _, _, sum_probs, _) = [
float(x) for x in match.groups()
]
# The next section reads the detailed comparisons. These are in a 'human
# readable' format which has a fixed length. The strategy employed is to
# assume that each block starts with the query sequence line, and to parse
# that with a regexp in order to deduce the fixed length used for that block.
query = ""
hit_sequence = ""
indices_query = []
indices_hit = []
length_block = None
for line in detailed_lines[3:]:
# Parse the query sequence line
if (
line.startswith("Q ")
and not line.startswith("Q ss_dssp")
and not line.startswith("Q ss_pred")
and not line.startswith("Q Consensus")
):
# Thus the first 17 characters must be 'Q <query_name> ', and we can parse
# everything after that.
# start sequence end total_sequence_length
patt = r"[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)"
groups = _get_hhr_line_regex_groups(patt, line[17:])
# Get the length of the parsed block using the start and finish indices,
# and ensure it is the same as the actual block length.
start = int(groups[0]) - 1 # Make index zero based.
delta_query = groups[1]
end = int(groups[2])
num_insertions = len([x for x in delta_query if x == "-"])
length_block = end - start + num_insertions
assert length_block == len(delta_query)
# Update the query sequence and indices list.
query += delta_query
_update_hhr_residue_indices_list(delta_query, start, indices_query)
elif line.startswith("T "):
# Parse the hit sequence.
if (
not line.startswith("T ss_dssp")
and not line.startswith("T ss_pred")
and not line.startswith("T Consensus")
):
# Thus the first 17 characters must be 'T <hit_name> ', and we can
# parse everything after that.
# start sequence end total_sequence_length
patt = r"[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)"
groups = _get_hhr_line_regex_groups(patt, line[17:])
start = int(groups[0]) - 1 # Make index zero based.
delta_hit_sequence = groups[1]
assert length_block == len(delta_hit_sequence)
# Update the hit sequence and indices list.
hit_sequence += delta_hit_sequence
_update_hhr_residue_indices_list(
delta_hit_sequence, start, indices_hit
)
return TemplateHit(
index=number_of_hit,
name=name_hit,
aligned_cols=int(aligned_cols),
sum_probs=sum_probs,
query=query,
hit_sequence=hit_sequence,
indices_query=indices_query,
indices_hit=indices_hit,
)
def parse_hhr(hhr_string: str) -> Sequence[TemplateHit]:
"""Parses the content of an entire HHR file."""
lines = hhr_string.splitlines()
# Each .hhr file starts with a results table, then has a sequence of hit
# "paragraphs", each paragraph starting with a line 'No <hit number>'. We
# iterate through each paragraph to parse each hit.
block_starts = [i for i, line in enumerate(lines) if line.startswith("No ")]
hits = []
if block_starts:
block_starts.append(len(lines)) # Add the end of the final block.
for i in range(len(block_starts) - 1):
hits.append(
_parse_hhr_hit(lines[block_starts[i] : block_starts[i + 1]])
)
return hits
def parse_e_values_from_tblout(tblout: str) -> Dict[str, float]:
"""Parse target to e-value mapping parsed from Jackhmmer tblout string."""
e_values = {"query": 0}
lines = [line for line in tblout.splitlines() if line[0] != "#"]
# As per http://eddylab.org/software/hmmer/Userguide.pdf fields are
# space-delimited. Relevant fields are (1) target name: and
# (5) E-value (full sequence) (numbering from 1).
for line in lines:
fields = line.split()
e_value = fields[4]
target_name = fields[0]
e_values[target_name] = float(e_value)
return e_values
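# A minimal, hypothetical sketch of parse_e_values_from_tblout() with a
# fabricated tblout line laid out as in the HMMER user guide: target name in
# column 1 and the full-sequence E-value in column 5.
def _example_parse_e_values_from_tblout():
    tblout = (
        "# comment line\n"
        "5xyz_A - query - 2.5e-30 100.0 0.1\n"
    )
    e_values = parse_e_values_from_tblout(tblout)
    # e_values == {"query": 0, "5xyz_A": 2.5e-30}
    return e_values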
def _get_indices(sequence: str, start: int) -> List[int]:
"""Returns indices for non-gap/insert residues starting at the given index."""
indices = []
counter = start
for symbol in sequence:
# Skip gaps but add a placeholder so that the alignment is preserved.
if symbol == '-':
indices.append(-1)
# Skip deleted residues, but increase the counter.
elif symbol.islower():
counter += 1
# Normal aligned residue. Increase the counter and append to indices.
else:
indices.append(counter)
counter += 1
return indices
@dataclasses.dataclass(frozen=True)
class HitMetadata:
pdb_id: str
chain: str
start: int
end: int
length: int
text: str
def _parse_hmmsearch_description(description: str) -> HitMetadata:
"""Parses the hmmsearch A3M sequence description line."""
# Example 1: >4pqx_A/2-217 [subseq from] mol:protein length:217 Free text
# Example 2: >5g3r_A/1-55 [subseq from] mol:protein length:352
match = re.match(
r'^>?([a-z0-9]+)_(\w+)/([0-9]+)-([0-9]+).*protein length:([0-9]+) *(.*)$',
description.strip())
if not match:
raise ValueError(f'Could not parse description: "{description}".')
return HitMetadata(
pdb_id=match[1],
chain=match[2],
start=int(match[3]),
end=int(match[4]),
length=int(match[5]),
text=match[6]
)
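# A minimal, hypothetical sketch of _parse_hmmsearch_description(), reusing the
# first example format quoted in the docstring above.
def _example_parse_hmmsearch_description():
    meta = _parse_hmmsearch_description(
        ">4pqx_A/2-217 [subseq from] mol:protein length:217 Free text"
    )
    # meta.pdb_id == "4pqx", meta.chain == "A", meta.start == 2,
    # meta.end == 217, meta.length == 217, meta.text == "Free text"
    return meta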
def parse_hmmsearch_a3m(
query_sequence: str,
a3m_string: str,
skip_first: bool = True
) -> Sequence[TemplateHit]:
"""Parses an a3m string produced by hmmsearch.
Args:
query_sequence: The query sequence.
a3m_string: The a3m string produced by hmmsearch.
skip_first: Whether to skip the first sequence in the a3m string.
Returns:
A sequence of `TemplateHit` results.
"""
# Zip the descriptions and MSAs together, skip the first query sequence.
parsed_a3m = list(zip(*parse_fasta(a3m_string)))
if skip_first:
parsed_a3m = parsed_a3m[1:]
indices_query = _get_indices(query_sequence, start=0)
hits = []
for i, (hit_sequence, hit_description) in enumerate(parsed_a3m, start=1):
if 'mol:protein' not in hit_description:
continue # Skip non-protein chains.
metadata = _parse_hmmsearch_description(hit_description)
# Aligned columns are only the match states.
aligned_cols = sum([r.isupper() and r != '-' for r in hit_sequence])
indices_hit = _get_indices(hit_sequence, start=metadata.start - 1)
hit = TemplateHit(
index=i,
name=f'{metadata.pdb_id}_{metadata.chain}',
aligned_cols=aligned_cols,
sum_probs=None,
query=query_sequence,
hit_sequence=hit_sequence.upper(),
indices_query=indices_query,
indices_hit=indices_hit,
)
hits.append(hit)
return hits
def parse_hmmsearch_sto(
output_string: str,
input_sequence: str
) -> Sequence[TemplateHit]:
"""Gets parsed template hits from the raw string output by the tool."""
a3m_string = convert_stockholm_to_a3m(
output_string,
remove_first_row_gaps=False
)
template_hits = parse_hmmsearch_a3m(
query_sequence=input_sequence,
a3m_string=a3m_string,
skip_first=False
)
return template_hits
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions for getting templates and calculating template features."""
import dataclasses
import datetime
import glob
import json
import logging
import os
import re
from typing import Any, Dict, Mapping, Optional, Sequence, Tuple
import numpy as np
from fastfold.data import parsers, mmcif_parsing
from fastfold.data.errors import Error
from fastfold.data.tools import kalign
from fastfold.data.tools.utils import to_date
from fastfold.common import residue_constants
class NoChainsError(Error):
"""An error indicating that template mmCIF didn't have any chains."""
class SequenceNotInTemplateError(Error):
"""An error indicating that template mmCIF didn't contain the sequence."""
class NoAtomDataInTemplateError(Error):
"""An error indicating that template mmCIF didn't contain atom positions."""
class TemplateAtomMaskAllZerosError(Error):
"""An error indicating that template mmCIF had all atom positions masked."""
class QueryToTemplateAlignError(Error):
"""An error indicating that the query can't be aligned to the template."""
class CaDistanceError(Error):
"""An error indicating that a CA atom distance exceeds a threshold."""
# Prefilter exceptions.
class PrefilterError(Exception):
"""A base class for template prefilter exceptions."""
class DateError(PrefilterError):
"""An error indicating that the hit date was after the max allowed date."""
class PdbIdError(PrefilterError):
"""An error indicating that the hit PDB ID was identical to the query."""
class AlignRatioError(PrefilterError):
"""An error indicating that the hit align ratio to the query was too small."""
class DuplicateError(PrefilterError):
"""An error indicating that the hit was an exact subsequence of the query."""
class LengthError(PrefilterError):
"""An error indicating that the hit was too short."""
TEMPLATE_FEATURES = {
"template_aatype": np.int64,
"template_all_atom_mask": np.float32,
"template_all_atom_positions": np.float32,
"template_domain_names": np.object,
"template_sequence": np.object,
"template_sum_probs": np.float32,
}
def _get_pdb_id_and_chain(hit: parsers.TemplateHit) -> Tuple[str, str]:
"""Returns PDB id and chain id for an HHSearch Hit."""
# PDB ID: 4 letters. Chain ID: 1+ alphanumeric letters or "." if unknown.
id_match = re.match(r"[a-zA-Z\d]{4}_[a-zA-Z0-9.]+", hit.name)
if not id_match:
raise ValueError(f"hit.name did not start with PDBID_chain: {hit.name}")
pdb_id, chain_id = id_match.group(0).split("_")
return pdb_id.lower(), chain_id
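# Illustrative sketch (hypothetical hit name): a hit whose name starts with
# "4pqx_A" yields ("4pqx", "A"); names that do not start with a PDBID_chain
# prefix raise ValueError.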
def _is_after_cutoff(
pdb_id: str,
release_dates: Mapping[str, datetime.datetime],
release_date_cutoff: Optional[datetime.datetime],
) -> bool:
"""Checks if the template date is after the release date cutoff.
Args:
pdb_id: 4 letter pdb code.
release_dates: Dictionary mapping PDB ids to their structure release dates.
release_date_cutoff: Max release date that is valid for this query.
Returns:
True if the template release date is after the cutoff, False otherwise.
"""
pdb_id_upper = pdb_id.upper()
if release_date_cutoff is None:
raise ValueError("The release_date_cutoff must not be None.")
if pdb_id_upper in release_dates:
return release_dates[pdb_id_upper] > release_date_cutoff
else:
# Since this is just a quick prefilter to reduce the number of mmCIF files
# we need to parse, we don't have to worry about returning True here.
logging.info(
"Template structure not in release dates dict: %s", pdb_id
)
return False
def _parse_obsolete(obsolete_file_path: str) -> Mapping[str, str]:
"""Parses the data file from PDB that lists which PDB ids are obsolete."""
with open(obsolete_file_path) as f:
result = {}
for line in f:
line = line.strip()
# We skip obsolete entries that don't contain a mapping to a new entry.
if line.startswith("OBSLTE") and len(line) > 30:
# Format: Date From To
# 'OBSLTE 31-JUL-94 116L 216L'
from_id = line[20:24].lower()
to_id = line[29:33].lower()
result[from_id] = to_id
return result
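# Illustrative sketch (hypothetical record, assuming the fixed-width OBSLTE
# format shown in the comment above): an OBSLTE line mapping entry 116L to its
# replacement 216L contributes {"116l": "216l"} to the returned dict; OBSLTE
# lines without a replacement entry are skipped.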
def generate_release_dates_cache(mmcif_dir: str, out_path: str):
dates = {}
for f in os.listdir(mmcif_dir):
if f.endswith(".cif"):
path = os.path.join(mmcif_dir, f)
with open(path, "r") as fp:
mmcif_string = fp.read()
file_id = os.path.splitext(f)[0]
mmcif = mmcif_parsing.parse(
file_id=file_id, mmcif_string=mmcif_string
)
if mmcif.mmcif_object is None:
logging.info(f"Failed to parse {f}. Skipping...")
continue
mmcif = mmcif.mmcif_object
release_date = mmcif.header["release_date"]
dates[file_id] = release_date
    with open(out_path, "w") as fp:
fp.write(json.dumps(dates))
def _parse_release_dates(path: str) -> Mapping[str, datetime.datetime]:
"""Parses release dates file, returns a mapping from PDBs to release dates."""
with open(path, "r") as fp:
data = json.load(fp)
return {
pdb.upper(): to_date(v)
for pdb, d in data.items()
for k, v in d.items()
if k == "release_date"
}
def _assess_hhsearch_hit(
hit: parsers.TemplateHit,
hit_pdb_code: str,
query_sequence: str,
release_dates: Mapping[str, datetime.datetime],
release_date_cutoff: datetime.datetime,
query_pdb_code: Optional[str] = None,
max_subsequence_ratio: float = 0.95,
min_align_ratio: float = 0.1,
) -> bool:
"""Determines if template is valid (without parsing the template mmcif file).
Args:
hit: HhrHit for the template.
hit_pdb_code: The 4 letter pdb code of the template hit. This might be
different from the value in the actual hit since the original pdb might
have become obsolete.
query_sequence: Amino acid sequence of the query.
query_pdb_code: 4 letter pdb code of the query.
release_dates: Dictionary mapping pdb codes to their structure release
dates.
release_date_cutoff: Max release date that is valid for this query.
max_subsequence_ratio: Exclude any exact matches with this much overlap.
min_align_ratio: Minimum overlap between the template and query.
Returns:
True if the hit passed the prefilter. Raises an exception otherwise.
Raises:
DateError: If the hit date was after the max allowed date.
PdbIdError: If the hit PDB ID was identical to the query.
AlignRatioError: If the hit align ratio to the query was too small.
DuplicateError: If the hit was an exact subsequence of the query.
LengthError: If the hit was too short.
"""
aligned_cols = hit.aligned_cols
align_ratio = aligned_cols / len(query_sequence)
template_sequence = hit.hit_sequence.replace("-", "")
length_ratio = float(len(template_sequence)) / len(query_sequence)
# Check whether the template is a large subsequence or duplicate of original
# query. This can happen due to duplicate entries in the PDB database.
duplicate = (
template_sequence in query_sequence
and length_ratio > max_subsequence_ratio
)
if _is_after_cutoff(hit_pdb_code, release_dates, release_date_cutoff):
date = release_dates[hit_pdb_code.upper()]
raise DateError(
f"Date ({date}) > max template date "
f"({release_date_cutoff})."
)
if query_pdb_code is not None:
if query_pdb_code.lower() == hit_pdb_code.lower():
raise PdbIdError("PDB code identical to Query PDB code.")
if align_ratio <= min_align_ratio:
raise AlignRatioError(
"Proportion of residues aligned to query too small. "
f"Align ratio: {align_ratio}."
)
if duplicate:
raise DuplicateError(
"Template is an exact subsequence of query with large "
f"coverage. Length ratio: {length_ratio}."
)
if len(template_sequence) < 10:
raise LengthError(
f"Template too short. Length: {len(template_sequence)}."
)
return True
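# Worked example (hypothetical numbers, using the default thresholds above):
# for a 100-residue query, a hit with 40 aligned columns gives
# align_ratio = 0.4 > min_align_ratio = 0.1, so it passes that check; a hit
# whose ungapped 96-residue sequence is contained in the query gives
# length_ratio = 0.96 > max_subsequence_ratio = 0.95 and is rejected with
# DuplicateError.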
def _find_template_in_pdb(
template_chain_id: str,
template_sequence: str,
mmcif_object: mmcif_parsing.MmcifObject,
) -> Tuple[str, str, int]:
"""Tries to find the template chain in the given pdb file.
This method tries the three following things in order:
1. Tries if there is an exact match in both the chain ID and the sequence.
If yes, the chain sequence is returned. Otherwise:
2. Tries if there is an exact match only in the sequence.
If yes, the chain sequence is returned. Otherwise:
3. Tries if there is a fuzzy match (X = wildcard) in the sequence.
If yes, the chain sequence is returned.
If none of these succeed, a SequenceNotInTemplateError is thrown.
Args:
template_chain_id: The template chain ID.
template_sequence: The template chain sequence.
mmcif_object: The PDB object to search for the template in.
Returns:
A tuple with:
* The chain sequence that was found to match the template in the PDB object.
* The ID of the chain that is being returned.
* The offset where the template sequence starts in the chain sequence.
Raises:
SequenceNotInTemplateError: If no match is found after the steps described
above.
"""
# Try if there is an exact match in both the chain ID and the (sub)sequence.
pdb_id = mmcif_object.file_id
chain_sequence = mmcif_object.chain_to_seqres.get(template_chain_id)
if chain_sequence and (template_sequence in chain_sequence):
logging.info(
"Found an exact template match %s_%s.", pdb_id, template_chain_id
)
mapping_offset = chain_sequence.find(template_sequence)
return chain_sequence, template_chain_id, mapping_offset
# Try if there is an exact match in the (sub)sequence only.
for chain_id, chain_sequence in mmcif_object.chain_to_seqres.items():
if chain_sequence and (template_sequence in chain_sequence):
logging.info("Found a sequence-only match %s_%s.", pdb_id, chain_id)
mapping_offset = chain_sequence.find(template_sequence)
return chain_sequence, chain_id, mapping_offset
# Return a chain sequence that fuzzy matches (X = wildcard) the template.
# Make parentheses unnamed groups (?:_) to avoid the 100 named groups limit.
regex = ["." if aa == "X" else "(?:%s|X)" % aa for aa in template_sequence]
regex = re.compile("".join(regex))
for chain_id, chain_sequence in mmcif_object.chain_to_seqres.items():
match = re.search(regex, chain_sequence)
if match:
logging.info(
"Found a fuzzy sequence-only match %s_%s.", pdb_id, chain_id
)
mapping_offset = match.start()
return chain_sequence, chain_id, mapping_offset
# No hits, raise an error.
raise SequenceNotInTemplateError(
"Could not find the template sequence in %s_%s. Template sequence: %s, "
"chain_to_seqres: %s"
% (
pdb_id,
template_chain_id,
template_sequence,
mmcif_object.chain_to_seqres,
)
)
def _realign_pdb_template_to_query(
old_template_sequence: str,
template_chain_id: str,
mmcif_object: mmcif_parsing.MmcifObject,
old_mapping: Mapping[int, int],
kalign_binary_path: str,
) -> Tuple[str, Mapping[int, int]]:
"""Aligns template from the mmcif_object to the query.
In case PDB70 contains a different version of the template sequence, we need
to perform a realignment to the actual sequence that is in the mmCIF file.
This method performs such realignment, but returns the new sequence and
mapping only if the sequence in the mmCIF file is 90% identical to the old
sequence.
Note that the old_template_sequence comes from the hit, and contains only that
part of the chain that matches with the query while the new_template_sequence
is the full chain.
Args:
old_template_sequence: The template sequence that was returned by the PDB
template search (typically done using HHSearch).
template_chain_id: The template chain id was returned by the PDB template
search (typically done using HHSearch). This is used to find the right
chain in the mmcif_object chain_to_seqres mapping.
mmcif_object: A mmcif_object which holds the actual template data.
old_mapping: A mapping from the query sequence to the template sequence.
This mapping will be used to compute the new mapping from the query
sequence to the actual mmcif_object template sequence by aligning the
old_template_sequence and the actual template sequence.
kalign_binary_path: The path to a kalign executable.
Returns:
A tuple (new_template_sequence, new_query_to_template_mapping) where:
* new_template_sequence is the actual template sequence that was found in
the mmcif_object.
* new_query_to_template_mapping is the new mapping from the query to the
actual template found in the mmcif_object.
Raises:
QueryToTemplateAlignError:
* If there was an error thrown by the alignment tool.
* Or if the actual template sequence differs by more than 10% from the
old_template_sequence.
"""
aligner = kalign.Kalign(binary_path=kalign_binary_path)
new_template_sequence = mmcif_object.chain_to_seqres.get(
template_chain_id, ""
)
# Sometimes the template chain id is unknown. But if there is only a single
# sequence within the mmcif_object, it is safe to assume it is that one.
if not new_template_sequence:
if len(mmcif_object.chain_to_seqres) == 1:
logging.info(
"Could not find %s in %s, but there is only 1 sequence, so "
"using that one.",
template_chain_id,
mmcif_object.file_id,
)
new_template_sequence = list(mmcif_object.chain_to_seqres.values())[
0
]
else:
raise QueryToTemplateAlignError(
f"Could not find chain {template_chain_id} in {mmcif_object.file_id}. "
"If there are no mmCIF parsing errors, it is possible it was not a "
"protein chain."
)
try:
(old_aligned_template, new_aligned_template), _ = parsers.parse_a3m(
aligner.align([old_template_sequence, new_template_sequence])
)
except Exception as e:
raise QueryToTemplateAlignError(
"Could not align old template %s to template %s (%s_%s). Error: %s"
% (
old_template_sequence,
new_template_sequence,
mmcif_object.file_id,
template_chain_id,
str(e),
)
)
logging.info(
"Old aligned template: %s\nNew aligned template: %s",
old_aligned_template,
new_aligned_template,
)
old_to_new_template_mapping = {}
old_template_index = -1
new_template_index = -1
num_same = 0
for old_template_aa, new_template_aa in zip(
old_aligned_template, new_aligned_template
):
if old_template_aa != "-":
old_template_index += 1
if new_template_aa != "-":
new_template_index += 1
if old_template_aa != "-" and new_template_aa != "-":
old_to_new_template_mapping[old_template_index] = new_template_index
if old_template_aa == new_template_aa:
num_same += 1
    # Require at least 90% sequence identity with respect to the shorter of the sequences.
if (
float(num_same)
/ min(len(old_template_sequence), len(new_template_sequence))
< 0.9
):
raise QueryToTemplateAlignError(
"Insufficient similarity of the sequence in the database: %s to the "
"actual sequence in the mmCIF file %s_%s: %s. We require at least "
"90 %% similarity wrt to the shorter of the sequences. This is not a "
"problem unless you think this is a template that should be included."
% (
old_template_sequence,
mmcif_object.file_id,
template_chain_id,
new_template_sequence,
)
)
new_query_to_template_mapping = {}
for query_index, old_template_index in old_mapping.items():
new_query_to_template_mapping[
query_index
] = old_to_new_template_mapping.get(old_template_index, -1)
new_template_sequence = new_template_sequence.replace("-", "")
return new_template_sequence, new_query_to_template_mapping
def _check_residue_distances(
all_positions: np.ndarray,
all_positions_mask: np.ndarray,
max_ca_ca_distance: float,
):
"""Checks if the distance between unmasked neighbor residues is ok."""
ca_position = residue_constants.atom_order["CA"]
prev_is_unmasked = False
prev_calpha = None
for i, (coords, mask) in enumerate(zip(all_positions, all_positions_mask)):
this_is_unmasked = bool(mask[ca_position])
if this_is_unmasked:
this_calpha = coords[ca_position]
if prev_is_unmasked:
distance = np.linalg.norm(this_calpha - prev_calpha)
if distance > max_ca_ca_distance:
raise CaDistanceError(
"The distance between residues %d and %d is %f > limit %f."
% (i, i + 1, distance, max_ca_ca_distance)
)
prev_calpha = this_calpha
prev_is_unmasked = this_is_unmasked
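# Illustrative sketch (hypothetical coordinates): two adjacent residues that
# are both unmasked and whose CA atoms lie 200 Å apart would raise
# CaDistanceError when max_ca_ca_distance is 150.0; pairs involving a masked
# residue are not checked.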
def _get_atom_positions(
mmcif_object: mmcif_parsing.MmcifObject,
auth_chain_id: str,
max_ca_ca_distance: float,
_zero_center_positions: bool = True,
) -> Tuple[np.ndarray, np.ndarray]:
"""Gets atom positions and mask from a list of Biopython Residues."""
coords_with_mask = mmcif_parsing.get_atom_coords(
mmcif_object=mmcif_object,
chain_id=auth_chain_id,
_zero_center_positions=_zero_center_positions,
)
all_atom_positions, all_atom_mask = coords_with_mask
_check_residue_distances(
all_atom_positions, all_atom_mask, max_ca_ca_distance
)
return all_atom_positions, all_atom_mask
def _extract_template_features(
mmcif_object: mmcif_parsing.MmcifObject,
pdb_id: str,
mapping: Mapping[int, int],
template_sequence: str,
query_sequence: str,
template_chain_id: str,
kalign_binary_path: str,
_zero_center_positions: bool = True,
) -> Tuple[Dict[str, Any], Optional[str]]:
"""Parses atom positions in the target structure and aligns with the query.
Atoms for each residue in the template structure are indexed to coincide
with their corresponding residue in the query sequence, according to the
alignment mapping provided.
Args:
mmcif_object: mmcif_parsing.MmcifObject representing the template.
pdb_id: PDB code for the template.
mapping: Dictionary mapping indices in the query sequence to indices in
the template sequence.
template_sequence: String describing the amino acid sequence for the
template protein.
query_sequence: String describing the amino acid sequence for the query
protein.
template_chain_id: String ID describing which chain in the structure proto
should be used.
kalign_binary_path: The path to a kalign executable used for template
realignment.
Returns:
A tuple with:
* A dictionary containing the extra features derived from the template
protein structure.
* A warning message if the hit was realigned to the actual mmCIF sequence.
Otherwise None.
Raises:
NoChainsError: If the mmcif object doesn't contain any chains.
SequenceNotInTemplateError: If the given chain id / sequence can't
be found in the mmcif object.
QueryToTemplateAlignError: If the actual template in the mmCIF file
can't be aligned to the query.
NoAtomDataInTemplateError: If the mmcif object doesn't contain
atom positions.
TemplateAtomMaskAllZerosError: If the mmcif object doesn't have any
unmasked residues.
"""
if mmcif_object is None or not mmcif_object.chain_to_seqres:
raise NoChainsError(
"No chains in PDB: %s_%s" % (pdb_id, template_chain_id)
)
warning = None
try:
seqres, chain_id, mapping_offset = _find_template_in_pdb(
template_chain_id=template_chain_id,
template_sequence=template_sequence,
mmcif_object=mmcif_object,
)
except SequenceNotInTemplateError:
# If PDB70 contains a different version of the template, we use the sequence
# from the mmcif_object.
chain_id = template_chain_id
warning = (
f"The exact sequence {template_sequence} was not found in "
f"{pdb_id}_{chain_id}. Realigning the template to the actual sequence."
)
logging.warning(warning)
# This throws an exception if it fails to realign the hit.
seqres, mapping = _realign_pdb_template_to_query(
old_template_sequence=template_sequence,
template_chain_id=template_chain_id,
mmcif_object=mmcif_object,
old_mapping=mapping,
kalign_binary_path=kalign_binary_path,
)
logging.info(
"Sequence in %s_%s: %s successfully realigned to %s",
pdb_id,
chain_id,
template_sequence,
seqres,
)
# The template sequence changed.
template_sequence = seqres
# No mapping offset, the query is aligned to the actual sequence.
mapping_offset = 0
try:
# Essentially set to infinity - we don't want to reject templates unless
# they're really really bad.
all_atom_positions, all_atom_mask = _get_atom_positions(
mmcif_object,
chain_id,
max_ca_ca_distance=150.0,
_zero_center_positions=_zero_center_positions,
)
except (CaDistanceError, KeyError) as ex:
raise NoAtomDataInTemplateError(
"Could not get atom data (%s_%s): %s" % (pdb_id, chain_id, str(ex))
) from ex
all_atom_positions = np.split(
all_atom_positions, all_atom_positions.shape[0]
)
all_atom_masks = np.split(all_atom_mask, all_atom_mask.shape[0])
output_templates_sequence = []
templates_all_atom_positions = []
templates_all_atom_masks = []
for _ in query_sequence:
# Residues in the query_sequence that are not in the template_sequence:
templates_all_atom_positions.append(
np.zeros((residue_constants.atom_type_num, 3))
)
templates_all_atom_masks.append(
np.zeros(residue_constants.atom_type_num)
)
output_templates_sequence.append("-")
for k, v in mapping.items():
template_index = v + mapping_offset
templates_all_atom_positions[k] = all_atom_positions[template_index][0]
templates_all_atom_masks[k] = all_atom_masks[template_index][0]
output_templates_sequence[k] = template_sequence[v]
# Alanine (AA with the lowest number of atoms) has 5 atoms (C, CA, CB, N, O).
if np.sum(templates_all_atom_masks) < 5:
raise TemplateAtomMaskAllZerosError(
"Template all atom mask was all zeros: %s_%s. Residue range: %d-%d"
% (
pdb_id,
chain_id,
min(mapping.values()) + mapping_offset,
max(mapping.values()) + mapping_offset,
)
)
output_templates_sequence = "".join(output_templates_sequence)
templates_aatype = residue_constants.sequence_to_onehot(
output_templates_sequence, residue_constants.HHBLITS_AA_TO_ID
)
return (
{
"template_all_atom_positions": np.array(
templates_all_atom_positions
),
"template_all_atom_mask": np.array(templates_all_atom_masks),
"template_sequence": output_templates_sequence.encode(),
"template_aatype": np.array(templates_aatype),
"template_domain_names": f"{pdb_id.lower()}_{chain_id}".encode(),
},
warning,
)
def _build_query_to_hit_index_mapping(
hit_query_sequence: str,
hit_sequence: str,
indices_hit: Sequence[int],
indices_query: Sequence[int],
original_query_sequence: str,
) -> Mapping[int, int]:
"""Gets mapping from indices in original query sequence to indices in the hit.
hit_query_sequence and hit_sequence are two aligned sequences containing gap
characters. hit_query_sequence contains only the part of the original query
sequence that matched the hit. When interpreting the indices from the .hhr, we
need to correct for this to recover a mapping from original query sequence to
the hit sequence.
Args:
hit_query_sequence: The portion of the query sequence that is in the .hhr
hit
hit_sequence: The portion of the hit sequence that is in the .hhr
indices_hit: The indices for each aminoacid relative to the hit sequence
indices_query: The indices for each aminoacid relative to the original query
sequence
original_query_sequence: String describing the original query sequence.
Returns:
Dictionary with indices in the original query sequence as keys and indices
in the hit sequence as values.
"""
# If the hit is empty (no aligned residues), return empty mapping
if not hit_query_sequence:
return {}
# Remove gaps and find the offset of hit.query relative to original query.
hhsearch_query_sequence = hit_query_sequence.replace("-", "")
hit_sequence = hit_sequence.replace("-", "")
hhsearch_query_offset = original_query_sequence.find(
hhsearch_query_sequence
)
# Index of -1 used for gap characters. Subtract the min index ignoring gaps.
min_idx = min(x for x in indices_hit if x > -1)
fixed_indices_hit = [x - min_idx if x > -1 else -1 for x in indices_hit]
min_idx = min(x for x in indices_query if x > -1)
fixed_indices_query = [x - min_idx if x > -1 else -1 for x in indices_query]
# Zip the corrected indices, ignore case where both seqs have gap characters.
mapping = {}
for q_i, q_t in zip(fixed_indices_query, fixed_indices_hit):
if q_t != -1 and q_i != -1:
if q_t >= len(hit_sequence) or q_i + hhsearch_query_offset >= len(
original_query_sequence
):
continue
mapping[q_i + hhsearch_query_offset] = q_t
return mapping
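# Worked example (hypothetical inputs): with original_query_sequence="XXAB",
# hit_query_sequence="AB", hit_sequence="AB", indices_query=[2, 3] and
# indices_hit=[5, 6], the query offset is 2 and both index lists are shifted
# to start at 0, giving the mapping {2: 0, 3: 1} from original-query indices
# to hit-sequence indices.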
@dataclasses.dataclass(frozen=True)
class PrefilterResult:
valid: bool
error: Optional[str]
warning: Optional[str]
@dataclasses.dataclass(frozen=True)
class SingleHitResult:
features: Optional[Mapping[str, Any]]
error: Optional[str]
warning: Optional[str]
def _prefilter_hit(
query_sequence: str,
hit: parsers.TemplateHit,
max_template_date: datetime.datetime,
release_dates: Mapping[str, datetime.datetime],
obsolete_pdbs: Mapping[str, str],
strict_error_check: bool = False,
query_pdb_code: Optional[str] = None,
):
# Fail hard if we can't get the PDB ID and chain name from the hit.
hit_pdb_code, hit_chain_id = _get_pdb_id_and_chain(hit)
if hit_pdb_code not in release_dates:
if hit_pdb_code in obsolete_pdbs:
hit_pdb_code = obsolete_pdbs[hit_pdb_code]
# Pass hit_pdb_code since it might have changed due to the pdb being
# obsolete.
try:
_assess_hhsearch_hit(
hit=hit,
hit_pdb_code=hit_pdb_code,
query_sequence=query_sequence,
query_pdb_code=query_pdb_code,
release_dates=release_dates,
release_date_cutoff=max_template_date,
)
except PrefilterError as e:
hit_name = f"{hit_pdb_code}_{hit_chain_id}"
msg = f"hit {hit_name} did not pass prefilter: {str(e)}"
logging.info("%s: %s", query_pdb_code, msg)
if strict_error_check and isinstance(
e, (DateError, PdbIdError, DuplicateError)
):
# In strict mode we treat some prefilter cases as errors.
return PrefilterResult(valid=False, error=msg, warning=None)
return PrefilterResult(valid=False, error=None, warning=None)
return PrefilterResult(valid=True, error=None, warning=None)
def _process_single_hit(
query_sequence: str,
hit: parsers.TemplateHit,
mmcif_dir: str,
max_template_date: datetime.datetime,
release_dates: Mapping[str, datetime.datetime],
obsolete_pdbs: Mapping[str, str],
kalign_binary_path: str,
strict_error_check: bool = False,
_zero_center_positions: bool = True,
query_pdb_code: Optional[str] = None,
) -> SingleHitResult:
"""Tries to extract template features from a single HHSearch hit."""
# Fail hard if we can't get the PDB ID and chain name from the hit.
hit_pdb_code, hit_chain_id = _get_pdb_id_and_chain(hit)
if hit_pdb_code not in release_dates:
if hit_pdb_code in obsolete_pdbs:
hit_pdb_code = obsolete_pdbs[hit_pdb_code]
mapping = _build_query_to_hit_index_mapping(
hit.query,
hit.hit_sequence,
hit.indices_hit,
hit.indices_query,
query_sequence,
)
# The mapping is from the query to the actual hit sequence, so we need to
# remove gaps (which regardless have a missing confidence score).
template_sequence = hit.hit_sequence.replace("-", "")
cif_path = os.path.join(mmcif_dir, hit_pdb_code + ".cif")
logging.info(
"Reading PDB entry from %s. Query: %s, template: %s",
cif_path,
query_sequence,
template_sequence,
)
# Fail if we can't find the mmCIF file.
with open(cif_path, "r") as cif_file:
cif_string = cif_file.read()
parsing_result = mmcif_parsing.parse(
file_id=hit_pdb_code, mmcif_string=cif_string
)
if parsing_result.mmcif_object is not None:
hit_release_date = datetime.datetime.strptime(
parsing_result.mmcif_object.header["release_date"], "%Y-%m-%d"
)
if hit_release_date > max_template_date:
error = "Template %s date (%s) > max template date (%s)." % (
hit_pdb_code,
hit_release_date,
max_template_date,
)
if strict_error_check:
return SingleHitResult(features=None, error=error, warning=None)
else:
logging.info(error)
return SingleHitResult(features=None, error=None, warning=None)
try:
features, realign_warning = _extract_template_features(
mmcif_object=parsing_result.mmcif_object,
pdb_id=hit_pdb_code,
mapping=mapping,
template_sequence=template_sequence,
query_sequence=query_sequence,
template_chain_id=hit_chain_id,
kalign_binary_path=kalign_binary_path,
_zero_center_positions=_zero_center_positions,
)
if hit.sum_probs is None:
features['template_sum_probs'] = [0]
else:
features["template_sum_probs"] = [hit.sum_probs]
# It is possible there were some errors when parsing the other chains in the
# mmCIF file, but the template features for the chain we want were still
# computed. In such case the mmCIF parsing errors are not relevant.
return SingleHitResult(
features=features, error=None, warning=realign_warning
)
except (
NoChainsError,
NoAtomDataInTemplateError,
TemplateAtomMaskAllZerosError,
) as e:
# These 3 errors indicate missing mmCIF experimental data rather than a
# problem with the template search, so turn them into warnings.
warning = (
"%s_%s: feature extracting errors: "
"%s, mmCIF parsing errors: %s"
% (
hit_pdb_code,
hit_chain_id,
str(e),
parsing_result.errors,
)
)
if strict_error_check:
return SingleHitResult(features=None, error=warning, warning=None)
else:
return SingleHitResult(features=None, error=None, warning=warning)
except Error as e:
error = (
"%s_%s: feature extracting errors: "
"%s, mmCIF parsing errors: %s"
% (
hit_pdb_code,
hit_chain_id,
str(e),
parsing_result.errors,
)
)
return SingleHitResult(features=None, error=error, warning=None)
@dataclasses.dataclass(frozen=True)
class TemplateSearchResult:
features: Mapping[str, Any]
errors: Sequence[str]
warnings: Sequence[str]
class TemplateHitFeaturizer:
"""A class for turning hhr hits to template features."""
def __init__(
self,
mmcif_dir: str,
max_template_date: str,
max_hits: int,
kalign_binary_path: str,
release_dates_path: Optional[str] = None,
obsolete_pdbs_path: Optional[str] = None,
strict_error_check: bool = False,
_shuffle_top_k_prefiltered: Optional[int] = None,
_zero_center_positions: bool = True,
):
"""Initializes the Template Search.
Args:
mmcif_dir: Path to a directory with mmCIF structures. Once a template ID
is found by HHSearch, this directory is used to retrieve the template
data.
max_template_date: The maximum date permitted for template structures. No
template with date higher than this date will be returned. In ISO8601
date format, YYYY-MM-DD.
max_hits: The maximum number of templates that will be returned.
kalign_binary_path: The path to a kalign executable used for template
realignment.
release_dates_path: An optional path to a file with a mapping from PDB IDs
to their release dates. Thanks to this we don't have to redundantly
parse mmCIF files to get that information.
obsolete_pdbs_path: An optional path to a file containing a mapping from
obsolete PDB IDs to the PDB IDs of their replacements.
strict_error_check: If True, then the following will be treated as errors:
* If any template date is after the max_template_date.
* If any template has identical PDB ID to the query.
* If any template is a duplicate of the query.
* Any feature computation errors.
"""
self._mmcif_dir = mmcif_dir
if not glob.glob(os.path.join(self._mmcif_dir, "*.cif")):
logging.error("Could not find CIFs in %s", self._mmcif_dir)
raise ValueError(f"Could not find CIFs in {self._mmcif_dir}")
try:
self._max_template_date = datetime.datetime.strptime(
max_template_date, "%Y-%m-%d"
)
except ValueError:
raise ValueError(
"max_template_date must be set and have format YYYY-MM-DD."
)
self.max_hits = max_hits
self._kalign_binary_path = kalign_binary_path
self._strict_error_check = strict_error_check
if release_dates_path:
logging.info(
"Using precomputed release dates %s.", release_dates_path
)
self._release_dates = _parse_release_dates(release_dates_path)
else:
self._release_dates = {}
if obsolete_pdbs_path:
logging.info(
"Using precomputed obsolete pdbs %s.", obsolete_pdbs_path
)
self._obsolete_pdbs = _parse_obsolete(obsolete_pdbs_path)
else:
self._obsolete_pdbs = {}
self._shuffle_top_k_prefiltered = _shuffle_top_k_prefiltered
self._zero_center_positions = _zero_center_positions
def get_templates(
self,
query_sequence: str,
query_release_date: Optional[datetime.datetime],
hits: Sequence[parsers.TemplateHit],
query_pdb_code: Optional[str] = None,
) -> TemplateSearchResult:
"""Computes the templates for given query sequence (more details above)."""
logging.info("Searching for template for: %s", query_pdb_code)
template_features = {}
for template_feature_name in TEMPLATE_FEATURES:
template_features[template_feature_name] = []
# Always use a max_template_date. Set to query_release_date minus 60 days
# if that's earlier.
template_cutoff_date = self._max_template_date
if query_release_date:
delta = datetime.timedelta(days=60)
if query_release_date - delta < template_cutoff_date:
template_cutoff_date = query_release_date - delta
assert template_cutoff_date < query_release_date
assert template_cutoff_date <= self._max_template_date
num_hits = 0
errors = []
warnings = []
filtered = []
for hit in hits:
prefilter_result = _prefilter_hit(
query_sequence=query_sequence,
query_pdb_code=query_pdb_code,
hit=hit,
max_template_date=template_cutoff_date,
release_dates=self._release_dates,
obsolete_pdbs=self._obsolete_pdbs,
strict_error_check=self._strict_error_check,
)
if prefilter_result.error:
errors.append(prefilter_result.error)
if prefilter_result.warning:
warnings.append(prefilter_result.warning)
if prefilter_result.valid:
filtered.append(hit)
filtered = list(
sorted(filtered, key=lambda x: x.sum_probs, reverse=True)
)
idx = list(range(len(filtered)))
if(self._shuffle_top_k_prefiltered):
stk = self._shuffle_top_k_prefiltered
idx[:stk] = np.random.permutation(idx[:stk])
for i in idx:
# We got all the templates we wanted, stop processing hits.
if num_hits >= self.max_hits:
break
hit = filtered[i]
result = _process_single_hit(
query_sequence=query_sequence,
query_pdb_code=query_pdb_code,
hit=hit,
mmcif_dir=self._mmcif_dir,
max_template_date=template_cutoff_date,
release_dates=self._release_dates,
obsolete_pdbs=self._obsolete_pdbs,
strict_error_check=self._strict_error_check,
kalign_binary_path=self._kalign_binary_path,
_zero_center_positions=self._zero_center_positions,
)
if result.error:
errors.append(result.error)
# There could be an error even if there are some results, e.g. thrown by
# other unparsable chains in the same mmCIF file.
if result.warning:
warnings.append(result.warning)
if result.features is None:
logging.info(
"Skipped invalid hit %s, error: %s, warning: %s",
hit.name,
result.error,
result.warning,
)
else:
# Increment the hit counter, since we got features out of this hit.
num_hits += 1
for k in template_features:
template_features[k].append(result.features[k])
for name in template_features:
if num_hits > 0:
template_features[name] = np.stack(
template_features[name], axis=0
).astype(TEMPLATE_FEATURES[name])
else:
# Make sure the feature has correct dtype even if empty.
template_features[name] = np.array(
[], dtype=TEMPLATE_FEATURES[name]
)
return TemplateSearchResult(
features=template_features, errors=errors, warnings=warnings
)
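# Minimal usage sketch (hypothetical paths and values; not executed here):
#
#     featurizer = TemplateHitFeaturizer(
#         mmcif_dir="/data/pdb_mmcif/mmcif_files",
#         max_template_date="2021-10-10",
#         max_hits=20,
#         kalign_binary_path="/usr/bin/kalign",
#     )
#     result = featurizer.get_templates(
#         query_sequence=query_seq,
#         query_release_date=None,
#         hits=template_hits,
#     )
#     template_feats = result.features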
class HmmsearchHitFeaturizer(TemplateHitFeaturizer):
def get_templates(
self,
query_sequence: str,
hits: Sequence[parsers.TemplateHit]
) -> TemplateSearchResult:
logging.info("Searching for template for: %s", query_sequence)
template_features = {}
for template_feature_name in TEMPLATE_FEATURES:
template_features[template_feature_name] = []
already_seen = set()
errors = []
warnings = []
        # DISCREPANCY: This prefiltering scheme is used here because it saves time.
filtered = []
for hit in hits:
prefilter_result = _prefilter_hit(
query_sequence=query_sequence,
hit=hit,
max_template_date=self._max_template_date,
release_dates=self._release_dates,
obsolete_pdbs=self._obsolete_pdbs,
strict_error_check=self._strict_error_check,
)
if prefilter_result.error:
errors.append(prefilter_result.error)
if prefilter_result.warning:
warnings.append(prefilter_result.warning)
if prefilter_result.valid:
filtered.append(hit)
filtered = list(
sorted(
filtered, key=lambda x: x.sum_probs if x.sum_probs else 0., reverse=True
)
)
idx = list(range(len(filtered)))
if(self._shuffle_top_k_prefiltered):
stk = self._shuffle_top_k_prefiltered
idx[:stk] = np.random.permutation(idx[:stk])
for i in idx:
if(len(already_seen) >= self.max_hits):
break
hit = filtered[i]
result = _process_single_hit(
query_sequence=query_sequence,
hit=hit,
mmcif_dir=self._mmcif_dir,
max_template_date = self._max_template_date,
release_dates = self._release_dates,
obsolete_pdbs = self._obsolete_pdbs,
strict_error_check = self._strict_error_check,
kalign_binary_path = self._kalign_binary_path
)
if result.error:
errors.append(result.error)
if result.warning:
warnings.append(result.warning)
if result.features is None:
logging.debug(
"Skipped invalid hit %s, error: %s, warning: %s",
hit.name, result.error, result.warning,
)
else:
already_seen_key = result.features["template_sequence"]
if(already_seen_key in already_seen):
continue
                # Remember this template sequence so that duplicate hits are skipped.
already_seen.add(already_seen_key)
for k in template_features:
template_features[k].append(result.features[k])
if already_seen:
for name in template_features:
template_features[name] = np.stack(
template_features[name], axis=0
).astype(TEMPLATE_FEATURES[name])
else:
num_res = len(query_sequence)
# Construct a default template with all zeros.
template_features = {
"template_aatype": np.zeros(
(1, num_res, len(residue_constants.restypes_with_x_and_gap)),
np.float32
),
"template_all_atom_masks": np.zeros(
(1, num_res, residue_constants.atom_type_num), np.float32
),
"template_all_atom_positions": np.zeros(
(1, num_res, residue_constants.atom_type_num, 3), np.float32
),
"template_domain_names": np.array([''.encode()], dtype=np.object),
"template_sequence": np.array([''.encode()], dtype=np.object),
"template_sum_probs": np.array([0], dtype=np.float32),
}
return TemplateSearchResult(
features=template_features,
errors=errors,
warnings=warnings,
)
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Library to run HHblits from Python."""
import glob
import logging
import os
import subprocess
from typing import Any, Mapping, Optional, Sequence
from fastfold.data.tools import utils
_HHBLITS_DEFAULT_P = 20
_HHBLITS_DEFAULT_Z = 500
class HHBlits:
"""Python wrapper of the HHblits binary."""
def __init__(
self,
*,
binary_path: str,
databases: Sequence[str],
n_cpu: int = 4,
n_iter: int = 3,
e_value: float = 0.001,
maxseq: int = 1_000_000,
realign_max: int = 100_000,
maxfilt: int = 100_000,
min_prefilter_hits: int = 1000,
all_seqs: bool = False,
alt: Optional[int] = None,
p: int = _HHBLITS_DEFAULT_P,
z: int = _HHBLITS_DEFAULT_Z,
cov: int = 0,
):
"""Initializes the Python HHblits wrapper.
Args:
binary_path: The path to the HHblits executable.
databases: A sequence of HHblits database paths. This should be the
common prefix for the database files (i.e. up to but not including
_hhm.ffindex etc.)
n_cpu: The number of CPUs to give HHblits.
n_iter: The number of HHblits iterations.
e_value: The E-value, see HHblits docs for more details.
maxseq: The maximum number of rows in an input alignment. Note that this
parameter is only supported in HHBlits version 3.1 and higher.
realign_max: Max number of HMM-HMM hits to realign. HHblits default: 500.
maxfilt: Max number of hits allowed to pass the 2nd prefilter.
HHblits default: 20000.
min_prefilter_hits: Min number of hits to pass prefilter.
HHblits default: 100.
all_seqs: Return all sequences in the MSA / Do not filter the result MSA.
HHblits default: False.
alt: Show up to this many alternative alignments.
p: Minimum Prob for a hit to be included in the output hhr file.
HHblits default: 20.
z: Hard cap on number of hits reported in the hhr file.
HHblits default: 500. NB: The relevant HHblits flag is -Z not -z.
cov: Minimum coverage with master sequence (%).
HHBlits default: 0
Raises:
RuntimeError: If HHblits binary not found within the path.
"""
self.binary_path = binary_path
self.databases = databases
for database_path in self.databases:
if not glob.glob(database_path + "_*"):
logging.error(
"Could not find HHBlits database %s", database_path
)
raise ValueError(
f"Could not find HHBlits database {database_path}"
)
self.n_cpu = n_cpu
self.n_iter = n_iter
self.e_value = e_value
self.maxseq = maxseq
self.realign_max = realign_max
self.maxfilt = maxfilt
self.min_prefilter_hits = min_prefilter_hits
self.all_seqs = all_seqs
self.alt = alt
self.p = p
self.z = z
self.cov = cov
def query(self, input_fasta_path: str) -> Mapping[str, Any]:
"""Queries the database using HHblits."""
with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir:
a3m_path = os.path.join(query_tmp_dir, "output.a3m")
db_cmd = []
for db_path in self.databases:
db_cmd.append("-d")
db_cmd.append(db_path)
cmd = [
self.binary_path,
"-i",
input_fasta_path,
"-cpu",
str(self.n_cpu),
"-oa3m",
a3m_path,
"-o",
"/dev/null",
"-n",
str(self.n_iter),
"-e",
str(self.e_value),
"-maxseq",
str(self.maxseq),
"-realign_max",
str(self.realign_max),
"-maxfilt",
str(self.maxfilt),
"-min_prefilter_hits",
str(self.min_prefilter_hits),
]
if self.all_seqs:
cmd += ["-all"]
if self.alt:
cmd += ["-alt", str(self.alt)]
if self.p != _HHBLITS_DEFAULT_P:
cmd += ["-p", str(self.p)]
if self.z != _HHBLITS_DEFAULT_Z:
cmd += ["-Z", str(self.z)]
if self.cov:
cmd += ["-cov", str(self.cov)]
cmd += db_cmd
logging.info('Launching subprocess "%s"', " ".join(cmd))
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
with utils.timing("HHblits query"):
stdout, stderr = process.communicate()
retcode = process.wait()
if retcode:
# Logs have a 15k character limit, so log HHblits error line by line.
logging.error("HHblits failed. HHblits stderr begin:")
for error_line in stderr.decode("utf-8").splitlines():
if error_line.strip():
logging.error(error_line.strip())
logging.error("HHblits stderr end")
raise RuntimeError(
"HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n"
% (stdout.decode("utf-8"), stderr[:500_000].decode("utf-8"))
)
with open(a3m_path) as f:
a3m = f.read()
raw_output = dict(
a3m=a3m,
output=stdout,
stderr=stderr,
n_iter=self.n_iter,
e_value=self.e_value,
)
return raw_output
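# Minimal usage sketch (hypothetical paths; not executed here):
#
#     hhblits_runner = HHBlits(
#         binary_path="/usr/bin/hhblits",
#         databases=["/data/uniclust30/uniclust30_2018_08"],
#     )
#     result = hhblits_runner.query("/tmp/query.fasta")
#     a3m_string = result["a3m"]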
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Library to run HHsearch from Python."""
import glob
import logging
import os
import subprocess
from typing import Sequence, Union
from fastfold.data.tools import utils
class HHSearch:
"""Python wrapper of the HHsearch binary."""
def __init__(
self,
*,
binary_path: str,
databases: Sequence[str],
n_cpu: int = 2,
maxseq: int = 1_000_000,
mact: float = 0.35,
min_align: int = 10,
max_align: int = 500,
min_lines: int = 10,
max_lines: int = 500,
aliw: int = 100000,
e_value: float = 0.001,
min_prob: float = 20.0,
):
"""Initializes the Python HHsearch wrapper.
Args:
binary_path: The path to the HHsearch executable.
databases: A sequence of HHsearch database paths. This should be the
common prefix for the database files (i.e. up to but not including
_hhm.ffindex etc.)
n_cpu: The number of CPUs to use
maxseq: The maximum number of rows in an input alignment. Note that this
parameter is only supported in HHBlits version 3.1 and higher.
mact: Posterior probability threshold for MAC realignment controlling greediness at alignment
ends.
min_align: Minimum number of alignments in alignment list. (-b)
max_align: Maximum number of alignments in alignment list. (-B)
min_lines: Minimum number of lines in summary hit list. (-z)
max_lines: Maximum number of lines in summary hit list. (-Z)
aliw: Number of columns per line in alignment list.
e_value: E-value cutoff for inclusion in result alignment. (-e)
min_prob: Minimum probability in summary and alignment list. (-p)
Raises:
RuntimeError: If HHsearch binary not found within the path.
"""
self.binary_path = binary_path
self.databases = databases
self.n_cpu = n_cpu
self.maxseq = maxseq
self.mact = mact
self.min_align = min_align
self.max_align = max_align
self.min_lines = min_lines
self.max_lines = max_lines
self.aliw = aliw
self.e_value = e_value
self.min_prob = min_prob
for database_path in self.databases:
if not glob.glob(database_path + "_*"):
logging.error(
"Could not find HHsearch database %s", database_path
)
raise ValueError(
f"Could not find HHsearch database {database_path}"
)
@property
def output_format(self) -> str:
return 'hhr'
@property
def input_format(self) -> str:
return 'a3m'
def query(self, a3m: str, gen_atab: bool = False) -> Union[str, tuple]:
"""Queries the database using HHsearch using a given a3m."""
with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir:
input_path = os.path.join(query_tmp_dir, "query.a3m")
hhr_path = os.path.join(query_tmp_dir, "output.hhr")
atab_path = os.path.join(query_tmp_dir, "output.atab")
with open(input_path, "w") as f:
f.write(a3m)
db_cmd = []
for db_path in self.databases:
db_cmd.append("-d")
db_cmd.append(db_path)
cmd = [
self.binary_path,
"-i",
input_path,
"-o",
hhr_path,
"-maxseq",
str(self.maxseq),
"-cpu",
str(self.n_cpu),
"-b",
str(self.min_align),
"-B",
str(self.max_align),
"-z",
str(self.min_lines),
"-Z",
str(self.max_lines),
"-mact",
str(self.mact),
"-aliw",
str(self.aliw),
"-e",
str(self.e_value),
"-p",
str(self.min_prob),
] + db_cmd
if gen_atab:
cmd += ["-atab", atab_path]
logging.info('Launching subprocess "%s"', " ".join(cmd))
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
with utils.timing("HHsearch query"):
stdout, stderr = process.communicate()
retcode = process.wait()
if retcode:
# Stderr is truncated to prevent proto size errors in Beam.
raise RuntimeError(
"HHSearch failed:\nstdout:\n%s\n\nstderr:\n%s\n"
% (stdout.decode("utf-8"), stderr[:100_000].decode("utf-8"))
)
with open(hhr_path) as f:
hhr = f.read()
if gen_atab:
with open(atab_path) as f:
atab = f.read()
if gen_atab:
return hhr, atab
else:
return hhr
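# Minimal usage sketch (hypothetical paths; not executed here):
#
#     hhsearch_runner = HHSearch(
#         binary_path="/usr/bin/hhsearch",
#         databases=["/data/pdb70/pdb70"],
#     )
#     hhr_string = hhsearch_runner.query(a3m_string)
#     hhr_string, atab_string = hhsearch_runner.query(a3m_string, gen_atab=True)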
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A Python wrapper for hmmbuild - construct HMM profiles from MSA."""
import os
import re
import subprocess
from absl import logging
from fastfold.data.tools import utils
class Hmmbuild(object):
"""Python wrapper of the hmmbuild binary."""
def __init__(self,
*,
binary_path: str,
singlemx: bool = False):
"""Initializes the Python hmmbuild wrapper.
Args:
binary_path: The path to the hmmbuild executable.
singlemx: Whether to use --singlemx flag. If True, it forces HMMBuild to
just use a common substitution score matrix.
Raises:
RuntimeError: If hmmbuild binary not found within the path.
"""
self.binary_path = binary_path
self.singlemx = singlemx
def build_profile_from_sto(self, sto: str, model_construction='fast') -> str:
"""Builds a HHM for the aligned sequences given as an A3M string.
Args:
sto: A string with the aligned sequences in the Stockholm format.
model_construction: Whether to use reference annotation in the msa to
determine consensus columns ('hand') or default ('fast').
Returns:
A string with the profile in the HMM format.
Raises:
RuntimeError: If hmmbuild fails.
"""
return self._build_profile(sto, model_construction=model_construction)
def build_profile_from_a3m(self, a3m: str) -> str:
"""Builds a HHM for the aligned sequences given as an A3M string.
Args:
a3m: A string with the aligned sequences in the A3M format.
Returns:
A string with the profile in the HMM format.
Raises:
RuntimeError: If hmmbuild fails.
"""
lines = []
for line in a3m.splitlines():
if not line.startswith('>'):
line = re.sub('[a-z]+', '', line) # Remove inserted residues.
lines.append(line + '\n')
msa = ''.join(lines)
return self._build_profile(msa, model_construction='fast')
def _build_profile(self, msa: str, model_construction: str = 'fast') -> str:
"""Builds a HMM for the aligned sequences given as an MSA string.
Args:
msa: A string with the aligned sequences, in A3M or STO format.
model_construction: Whether to use reference annotation in the msa to
determine consensus columns ('hand') or default ('fast').
Returns:
A string with the profile in the HMM format.
Raises:
RuntimeError: If hmmbuild fails.
ValueError: If unspecified arguments are provided.
"""
if model_construction not in {'hand', 'fast'}:
      raise ValueError(f'Invalid model_construction {model_construction} - only '
                       '"hand" and "fast" are supported.')
with utils.tmpdir_manager() as query_tmp_dir:
input_query = os.path.join(query_tmp_dir, 'query.msa')
output_hmm_path = os.path.join(query_tmp_dir, 'output.hmm')
with open(input_query, 'w') as f:
f.write(msa)
cmd = [self.binary_path]
# If adding flags, we have to do so before the output and input:
if model_construction == 'hand':
cmd.append(f'--{model_construction}')
if self.singlemx:
cmd.append('--singlemx')
cmd.extend([
'--amino',
output_hmm_path,
input_query,
])
logging.info('Launching subprocess %s', cmd)
process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
with utils.timing('hmmbuild query'):
stdout, stderr = process.communicate()
retcode = process.wait()
logging.info('hmmbuild stdout:\n%s\n\nstderr:\n%s\n',
stdout.decode('utf-8'), stderr.decode('utf-8'))
if retcode:
raise RuntimeError('hmmbuild failed\nstdout:\n%s\n\nstderr:\n%s\n'
% (stdout.decode('utf-8'), stderr.decode('utf-8')))
with open(output_hmm_path, encoding='utf-8') as f:
hmm = f.read()
return hmm
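# Minimal usage sketch (hypothetical path; not executed here):
#
#     hmmbuild_runner = Hmmbuild(binary_path='/usr/bin/hmmbuild')
#     hmm_profile = hmmbuild_runner.build_profile_from_sto(sto_string)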
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A Python wrapper for hmmsearch - search profile against a sequence db."""
import os
import subprocess
from typing import Optional, Sequence
from absl import logging
from fastfold.data import parsers
from fastfold.data.tools import hmmbuild
from fastfold.data.tools import utils
class Hmmsearch(object):
"""Python wrapper of the hmmsearch binary."""
def __init__(self,
*,
binary_path: str,
hmmbuild_binary_path: str,
database_path: str,
n_cpu: int=8,
flags: Optional[Sequence[str]] = None
):
"""Initializes the Python hmmsearch wrapper.
Args:
binary_path: The path to the hmmsearch executable.
hmmbuild_binary_path: The path to the hmmbuild executable. Used to build
an hmm from an input a3m.
database_path: The path to the hmmsearch database (FASTA format).
flags: List of flags to be used by hmmsearch.
Raises:
RuntimeError: If hmmsearch binary not found within the path.
"""
self.binary_path = binary_path
self.hmmbuild_runner = hmmbuild.Hmmbuild(binary_path=hmmbuild_binary_path)
self.database_path = database_path
self.n_cpu = n_cpu
if flags is None:
# Default hmmsearch run settings.
flags = ['--F1', '0.1',
'--F2', '0.1',
'--F3', '0.1',
'--incE', '100',
'-E', '100',
'--domE', '100',
'--incdomE', '100']
self.flags = flags
if not os.path.exists(self.database_path):
logging.error('Could not find hmmsearch database %s', database_path)
raise ValueError(f'Could not find hmmsearch database {database_path}')
@property
def output_format(self) -> str:
return 'sto'
@property
def input_format(self) -> str:
return 'sto'
def query(self, msa_sto: str, output_dir: Optional[str] = None) -> str:
"""Queries the database using hmmsearch using a given stockholm msa."""
hmm = self.hmmbuild_runner.build_profile_from_sto(
msa_sto,
model_construction='hand'
)
return self.query_with_hmm(hmm, output_dir)
def query_with_hmm(self,
hmm: str,
output_dir: Optional[str] = None
) -> str:
"""Queries the database using hmmsearch using a given hmm."""
with utils.tmpdir_manager() as query_tmp_dir:
hmm_input_path = os.path.join(query_tmp_dir, 'query.hmm')
output_dir = query_tmp_dir if output_dir is None else output_dir
out_path = os.path.join(output_dir, 'hmm_output.sto')
with open(hmm_input_path, 'w') as f:
f.write(hmm)
cmd = [
self.binary_path,
'--noali', # Don't include the alignment in stdout.
'--cpu', str(self.n_cpu)
]
# If adding flags, we have to do so before the output and input:
if self.flags:
cmd.extend(self.flags)
cmd.extend([
'-A', out_path,
hmm_input_path,
self.database_path,
])
logging.info('Launching sub-process %s', cmd)
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
with utils.timing(
f'hmmsearch ({os.path.basename(self.database_path)}) query'):
stdout, stderr = process.communicate()
retcode = process.wait()
if retcode:
raise RuntimeError(
'hmmsearch failed:\nstdout:\n%s\n\nstderr:\n%s\n' % (
stdout.decode('utf-8'), stderr.decode('utf-8')))
with open(out_path) as f:
out_msa = f.read()
return out_msa
@staticmethod
def get_template_hits(
output_string: str,
input_sequence: str
) -> Sequence[parsers.TemplateHit]:
"""Gets parsed template hits from the raw string output by the tool."""
template_hits = parsers.parse_hmmsearch_sto(
output_string,
input_sequence,
)
return template_hits
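# Minimal usage sketch (hypothetical paths; not executed here):
#
#     hmmsearch_runner = Hmmsearch(
#         binary_path='/usr/bin/hmmsearch',
#         hmmbuild_binary_path='/usr/bin/hmmbuild',
#         database_path='/data/pdb_seqres/pdb_seqres.txt',
#     )
#     sto_output = hmmsearch_runner.query(msa_sto)
#     hits = Hmmsearch.get_template_hits(sto_output, input_sequence=query_seq)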
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Library to run Jackhmmer from Python."""
from concurrent import futures
import glob
import logging
import os
import subprocess
from typing import Any, Callable, Mapping, Optional, Sequence
from urllib import request
from fastfold.data import parsers
from fastfold.data.tools import utils
class Jackhmmer:
"""Python wrapper of the Jackhmmer binary."""
def __init__(
self,
*,
binary_path: str,
database_path: str,
n_cpu: int = 8,
n_iter: int = 1,
e_value: float = 0.0001,
z_value: Optional[int] = None,
get_tblout: bool = False,
filter_f1: float = 0.0005,
filter_f2: float = 0.00005,
filter_f3: float = 0.0000005,
incdom_e: Optional[float] = None,
dom_e: Optional[float] = None,
num_streamed_chunks: Optional[int] = None,
streaming_callback: Optional[Callable[[int], None]] = None,
):
"""Initializes the Python Jackhmmer wrapper.
Args:
binary_path: The path to the jackhmmer executable.
database_path: The path to the jackhmmer database (FASTA format).
n_cpu: The number of CPUs to give Jackhmmer.
n_iter: The number of Jackhmmer iterations.
e_value: The E-value, see Jackhmmer docs for more details.
z_value: The Z-value, see Jackhmmer docs for more details.
get_tblout: Whether to save tblout string.
filter_f1: MSV and biased composition pre-filter, set to >1.0 to turn off.
filter_f2: Viterbi pre-filter, set to >1.0 to turn off.
filter_f3: Forward pre-filter, set to >1.0 to turn off.
incdom_e: Domain e-value criteria for inclusion of domains in MSA/next
round.
dom_e: Domain e-value criteria for inclusion in tblout.
num_streamed_chunks: Number of database chunks to stream over.
streaming_callback: Callback function run after each chunk iteration with
the iteration number as argument.
"""
self.binary_path = binary_path
self.database_path = database_path
self.num_streamed_chunks = num_streamed_chunks
if (
not os.path.exists(self.database_path)
and num_streamed_chunks is None
):
logging.error("Could not find Jackhmmer database %s", database_path)
raise ValueError(
f"Could not find Jackhmmer database {database_path}"
)
self.n_cpu = n_cpu
self.n_iter = n_iter
self.e_value = e_value
self.z_value = z_value
self.filter_f1 = filter_f1
self.filter_f2 = filter_f2
self.filter_f3 = filter_f3
self.incdom_e = incdom_e
self.dom_e = dom_e
self.get_tblout = get_tblout
self.streaming_callback = streaming_callback
def _query_chunk(
self,
input_fasta_path: str,
database_path: str,
max_sequences: Optional[int] = None
) -> Mapping[str, Any]:
"""Queries the database chunk using Jackhmmer."""
with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir:
sto_path = os.path.join(query_tmp_dir, "output.sto")
# The F1/F2/F3 are the expected proportion to pass each of the filtering
# stages (which get progressively more expensive), reducing these
            # speeds up the pipeline at the expense of sensitivity. They are
# currently set very low to make querying Mgnify run in a reasonable
# amount of time.
cmd_flags = [
# Don't pollute stdout with Jackhmmer output.
"-o",
"/dev/null",
"-A",
sto_path,
"--noali",
"--F1",
str(self.filter_f1),
"--F2",
str(self.filter_f2),
"--F3",
str(self.filter_f3),
"--incE",
str(self.e_value),
# Report only sequences with E-values <= x in per-sequence output.
"-E",
str(self.e_value),
"--cpu",
str(self.n_cpu),
"-N",
str(self.n_iter),
]
if self.get_tblout:
tblout_path = os.path.join(query_tmp_dir, "tblout.txt")
cmd_flags.extend(["--tblout", tblout_path])
if self.z_value:
cmd_flags.extend(["-Z", str(self.z_value)])
if self.dom_e is not None:
cmd_flags.extend(["--domE", str(self.dom_e)])
if self.incdom_e is not None:
cmd_flags.extend(["--incdomE", str(self.incdom_e)])
cmd = (
[self.binary_path]
+ cmd_flags
+ [input_fasta_path, database_path]
)
logging.info('Launching subprocess "%s"', " ".join(cmd))
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
with utils.timing(
f"Jackhmmer ({os.path.basename(database_path)}) query"
):
_, stderr = process.communicate()
retcode = process.wait()
if retcode:
raise RuntimeError(
"Jackhmmer failed\nstderr:\n%s\n" % stderr.decode("utf-8")
)
# Get e-values for each target name
tbl = ""
if self.get_tblout:
with open(tblout_path) as f:
tbl = f.read()
            if max_sequences is None:
with open(sto_path) as f:
sto = f.read()
else:
sto = parsers.truncate_stockholm_msa(sto_path, max_sequences)
raw_output = dict(
sto=sto,
tbl=tbl,
stderr=stderr,
n_iter=self.n_iter,
e_value=self.e_value,
)
return raw_output
def query(self,
input_fasta_path: str,
max_sequences: Optional[int] = None
) -> Sequence[Mapping[str, Any]]:
"""Queries the database using Jackhmmer."""
if self.num_streamed_chunks is None:
single_chunk_result = self._query_chunk(
input_fasta_path, self.database_path, max_sequences,
)
return [single_chunk_result]
db_basename = os.path.basename(self.database_path)
db_remote_chunk = lambda db_idx: f"{self.database_path}.{db_idx}"
db_local_chunk = lambda db_idx: f"/tmp/ramdisk/{db_basename}.{db_idx}"
        # Remove any chunks left over from a previous run so the ramdisk does
        # not run out of memory
for f in glob.glob(db_local_chunk("[0-9]*")):
try:
os.remove(f)
except OSError:
print(f"OSError while deleting {f}")
# Download the (i+1)-th chunk while Jackhmmer is running on the i-th chunk
with futures.ThreadPoolExecutor(max_workers=2) as executor:
chunked_output = []
for i in range(1, self.num_streamed_chunks + 1):
# Copy the chunk locally
if i == 1:
future = executor.submit(
request.urlretrieve,
db_remote_chunk(i),
db_local_chunk(i),
)
if i < self.num_streamed_chunks:
next_future = executor.submit(
request.urlretrieve,
db_remote_chunk(i + 1),
db_local_chunk(i + 1),
)
# Run Jackhmmer with the chunk
future.result()
chunked_output.append(
self._query_chunk(
input_fasta_path,
db_local_chunk(i),
max_sequences
)
)
# Remove the local copy of the chunk
os.remove(db_local_chunk(i))
                # next_future is not created for the last chunk, so only
                # advance `future` when another chunk remains; this also keeps
                # the loop correct for databases with a single chunk.
                if i < self.num_streamed_chunks:
                    future = next_future
if self.streaming_callback:
self.streaming_callback(i)
return chunked_output
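# Illustrative usage sketch (not part of the original module): running a
# single-chunk Jackhmmer search against a local FASTA database. The binary,
# database, and query paths below are hypothetical placeholders.
if __name__ == "__main__":
    _runner = Jackhmmer(
        binary_path="/usr/bin/jackhmmer",               # hypothetical path
        database_path="/data/uniref90/uniref90.fasta",  # hypothetical path
        n_cpu=8,
        n_iter=1,
    )
    # query() returns one result mapping per database chunk; with no streamed
    # chunks there is exactly one entry, whose "sto" field holds the MSA in
    # Stockholm format.
    _results = _runner.query("query.fasta", max_sequences=10000)
    print(_results[0]["sto"][:200])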
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A Python wrapper for Kalign."""
import os
import subprocess
from typing import Sequence
from absl import logging
from fastfold.data.tools import utils
def _to_a3m(sequences: Sequence[str]) -> str:
"""Converts sequences to an a3m file."""
names = ["sequence %d" % i for i in range(1, len(sequences) + 1)]
a3m = []
for sequence, name in zip(sequences, names):
a3m.append(u">" + name + u"\n")
a3m.append(sequence + u"\n")
return "".join(a3m)
class Kalign:
"""Python wrapper of the Kalign binary."""
def __init__(self, *, binary_path: str):
"""Initializes the Python Kalign wrapper.
Args:
binary_path: The path to the Kalign binary.
Raises:
            RuntimeError: If the Kalign binary is not found at the given path.
"""
self.binary_path = binary_path
def align(self, sequences: Sequence[str]) -> str:
"""Aligns the sequences and returns the alignment in A3M string.
Args:
            sequences: A list of query sequence strings. Each sequence must be
                at least 6 residues long (Kalign requires this). Note that the
                order in which the sequences are given may alter the output
                slightly, since a different alignment tree may be constructed.
Returns:
A string with the alignment in a3m format.
Raises:
RuntimeError: If Kalign fails.
ValueError: If any of the sequences is less than 6 residues long.
"""
logging.info("Aligning %d sequences", len(sequences))
for s in sequences:
if len(s) < 6:
raise ValueError(
"Kalign requires all sequences to be at least 6 "
"residues long. Got %s (%d residues)." % (s, len(s))
)
with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir:
input_fasta_path = os.path.join(query_tmp_dir, "input.fasta")
output_a3m_path = os.path.join(query_tmp_dir, "output.a3m")
with open(input_fasta_path, "w") as f:
f.write(_to_a3m(sequences))
cmd = [
self.binary_path,
"-i",
input_fasta_path,
"-o",
output_a3m_path,
"-format",
"fasta",
]
logging.info('Launching subprocess "%s"', " ".join(cmd))
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
with utils.timing("Kalign query"):
stdout, stderr = process.communicate()
retcode = process.wait()
logging.info(
"Kalign stdout:\n%s\n\nstderr:\n%s\n",
stdout.decode("utf-8"),
stderr.decode("utf-8"),
)
if retcode:
raise RuntimeError(
"Kalign failed\nstdout:\n%s\n\nstderr:\n%s\n"
% (stdout.decode("utf-8"), stderr.decode("utf-8"))
)
with open(output_a3m_path) as f:
a3m = f.read()
return a3m
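# Illustrative usage sketch (not part of the original module): aligning a few
# short sequences with a locally installed Kalign binary. The binary path and
# the sequences are hypothetical placeholders; each sequence must be at least
# 6 residues long.
if __name__ == "__main__":
    _kalign = Kalign(binary_path="/usr/bin/kalign")  # hypothetical path
    _alignment_a3m = _kalign.align(
        ["MKTAYIAKQRQISFVK", "MKTAYIARQGQISFVK", "MKTAYLAKQRQISFVK"]
    )
    print(_alignment_a3m)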
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common utilities for data pipeline tools."""
import contextlib
import datetime
import logging
import shutil
import tempfile
import time
from typing import Optional
@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):
"""Context manager that deletes a temporary directory on exit."""
tmpdir = tempfile.mkdtemp(dir=base_dir)
try:
yield tmpdir
finally:
shutil.rmtree(tmpdir, ignore_errors=True)
@contextlib.contextmanager
def timing(msg: str):
    """Context manager that logs the wall-clock duration of the wrapped block."""
logging.info("Started %s", msg)
tic = time.perf_counter()
yield
toc = time.perf_counter()
logging.info("Finished %s in %.3f seconds", msg, toc - tic)
def to_date(s: str):
    """Parses a date string of the form "YYYY-MM-DD" into a datetime object."""
return datetime.datetime(
year=int(s[:4]), month=int(s[5:7]), day=int(s[8:10])
)
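# Illustrative usage sketch (not part of the original module): combining the
# helpers above. The scratch file name is a placeholder.
if __name__ == "__main__":
    with tmpdir_manager(base_dir="/tmp") as tmp_dir:
        with timing("writing a scratch file"):
            with open(tmp_dir + "/scratch.txt", "w") as f:
                f.write("example contents")
    # to_date expects a string that starts with "YYYY-MM-DD".
    release_date = to_date("2021-07-15")
    print(release_date.isoformat())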