update uni-fold

a1c29028 · zhangqha · a1c29028 · a1c29028 · a1c29028 · a1c29028
Commit a1c29028 authored Apr 17, 2023 by zhangqha
20 changed files
--- a/unifold/msa/__pycache__/utils.cpython-37.pyc
+++ b/unifold/msa/__pycache__/utils.cpython-37.pyc
--- a/unifold/msa/mmcif.py
+++ b/unifold/msa/mmcif.py
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#            http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Parses the mmCIF file format."""
+import collections
+import dataclasses
+import functools
+import io
+from typing import Any, Mapping, Optional, Sequence, Tuple
+
+from absl import logging
+from Bio import PDB
+from Bio.PDB.MMCIFParser import MMCIFParser
+from Bio.Data import SCOPData
+
+# Type aliases used in the annotations throughout this module.
+ChainId = str  # A chain identifier.
+PdbHeader = Mapping[str, Any]  # Header fields keyed by name (see _get_header).
+PdbStructure = PDB.Structure.Structure  # Biopython structure class.
+SeqRes = str  # A chain sequence written as a string.
+MmCIFDict = Mapping[str, Sequence[str]]  # mmCIF data item name -> list of values.
+
+
+@dataclasses.dataclass(frozen=True)
+class Monomer:
+    """One entry of a polymer sequence, as built by `_get_protein_chains`."""
+
+    id: str  # Component id, read from `_entity_poly_seq.mon_id`.
+    num: int  # Sequence number, `int(...)` of `_entity_poly_seq.num`.
+
+
+# Note - mmCIF format provides no guarantees on the type of author-assigned
+# sequence numbers. They need not be integers.
+@dataclasses.dataclass(frozen=True)
+class AtomSite:
+    """One row of the mmCIF `_atom_site` loop, limited to the columns used here.
+
+    Instances are built positionally by `_get_atom_site_list`, so the field
+    order below has to match the column order used there.
+
+    NOTE(review): `_get_atom_site_list` passes the parsed values through
+    without conversion; `parse` compares `model_num` against the string "1"
+    and wraps `mmcif_seq_num` in `int(...)`. The `int` annotations on those
+    two fields therefore look inaccurate - confirm before relying on them.
+    """
+
+    residue_name: str  # _atom_site.label_comp_id
+    author_chain_id: str  # _atom_site.auth_asym_id
+    mmcif_chain_id: str  # _atom_site.label_asym_id
+    author_seq_num: str  # _atom_site.auth_seq_id
+    mmcif_seq_num: int  # _atom_site.label_seq_id
+    insertion_code: str  # _atom_site.pdbx_PDB_ins_code
+    hetatm_atom: str  # _atom_site.group_PDB; `parse` tests it against "HETATM".
+    model_num: int  # _atom_site.pdbx_PDB_model_num
+
+
+# Used to map SEQRES index to a residue in the structure.
+@dataclasses.dataclass(frozen=True)
+class ResiduePosition:
+    """Author-numbered location of a residue, as filled in by `parse`."""
+
+    chain_id: str  # Author chain id of the atom the position was taken from.
+    residue_number: int  # `int(...)` of the author sequence number.
+    insertion_code: str  # Insertion code; " " when the mmCIF value is unset.
+
+
+@dataclasses.dataclass(frozen=True)
+class ResidueAtPosition:
+    """What `parse` knows about the residue at one sequence index of a chain."""
+
+    position: Optional[ResiduePosition]  # None for residues flagged as missing.
+    name: str  # Residue / component name.
+    is_missing: bool  # True when no atom record mapped to this sequence index.
+    hetflag: str  # " ", "W" (HOH/WAT hetero atoms) or "H_" + residue name.
+
+
+@dataclasses.dataclass(frozen=True)
+class MmcifObject:
+    """Representation of a parsed mmCIF file.
+
+    Contains:
+        file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
+            files being processed.
+        header: Biopython header.
+        structure: Biopython structure. `fast_parse` leaves this as None.
+        chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g.
+            {'A': 'ABCDEFG'}. `fast_parse` leaves this as None.
+        seqres_to_structure: Dict; for each chain_id contains a mapping between
+            SEQRES index and a ResidueAtPosition. e.g.
+            {'A': {0: ResidueAtPosition, 1: ResidueAtPosition, ...}}.
+            `fast_parse` leaves this as None.
+        raw_string: The raw data used to construct the MmcifObject. Both `parse`
+            and `fast_parse` store the parsed mmCIF dict here, not a string.
+        mmcif_to_author_chain_id: Dict mapping internal mmCIF chain ids to the
+            author-assigned chain ids, collected from the first model's atoms.
+        valid_chains: Dict mapping mmCIF chain id to the value returned for that
+            chain by `_get_protein_chains` (a list of Monomers, despite the
+            `str` in the annotation below).
+    """
+
+    file_id: str
+    header: PdbHeader
+    structure: PdbStructure
+    chain_to_seqres: Mapping[ChainId, SeqRes]
+    seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]]
+    raw_string: Any
+    mmcif_to_author_chain_id: Mapping[ChainId, ChainId]
+    valid_chains: Mapping[ChainId, str]
+
+
+@dataclasses.dataclass(frozen=True)
+class ParsingResult:
+    """Returned by the parse function.
+
+    Contains:
+        mmcif_object: A MmcifObject, may be None if no chain could be successfully
+            parsed.
+        errors: A dict mapping (file_id, chain_id) to any exception generated.
+            The parse functions in this module always use "" as the chain_id,
+            and store a plain message string (not an exception) when no
+            protein chain is found.
+    """
+
+    mmcif_object: Optional[MmcifObject]
+    errors: Mapping[Tuple[str, str], Any]
+
+
+class ParseError(Exception):
+    """An error indicating that an mmCIF file could not be parsed.
+
+    NOTE(review): nothing in the visible part of this module raises it;
+    presumably it is used by callers - confirm.
+    """
+
+
+def mmcif_loop_to_list(
+    prefix: str, parsed_info: MmCIFDict
+) -> Sequence[Mapping[str, str]]:
+    """Collects the columns of one mmCIF loop and returns them row by row.
+
+    Reference for loop_ in mmCIF:
+        http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html
+
+    Args:
+        prefix: Common prefix of the loop's data items, trailing period
+            included, e.g. '_entity_poly_seq.' for _entity_poly_seq.num and
+            _entity_poly_seq.mon_id.
+        parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
+            parser.
+
+    Returns:
+        A list with one dict per loop row, keyed by the full data item name.
+        Empty when no key starts with `prefix`.
+
+    Raises:
+        AssertionError: If the matching columns differ in length.
+    """
+    # Select the columns that belong to the loop, preserving dict order.
+    loop_columns = {
+        key: value for key, value in parsed_info.items() if key.startswith(prefix)
+    }
+    cols = list(loop_columns)
+    data = list(loop_columns.values())
+
+    assert all([len(xs) == len(data[0]) for xs in data]), (
+        "mmCIF error: Not all loops are the same length: %s" % cols
+    )
+
+    # Transpose the columns into rows.
+    rows = []
+    for row_values in zip(*data):
+        rows.append(dict(zip(cols, row_values)))
+    return rows
+
+
+def mmcif_loop_to_dict(
+    prefix: str,
+    index: str,
+    parsed_info: MmCIFDict,
+) -> Mapping[str, Mapping[str, str]]:
+    """Returns the rows of one mmCIF loop, keyed by one of its columns.
+
+    Args:
+        prefix: Common prefix of the loop's data items, trailing period
+            included, e.g. '_entity_poly_seq.' for _entity_poly_seq.num and
+            _entity_poly_seq.mon_id.
+        index: Full name of the data item whose value becomes the key.
+        parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
+            parser.
+
+    Returns:
+        A dict of dicts; each inner dict is one loop row. When several rows
+        share an index value, the last one wins.
+    """
+    indexed = {}
+    for row in mmcif_loop_to_list(prefix, parsed_info):
+        indexed[row[index]] = row
+    return indexed
+
+
+@functools.lru_cache(16, typed=False)
+def fast_parse(
+    *, file_id: str, mmcif_string: str, catch_all_errors: bool = True
+) -> ParsingResult:
+    """Parses only the header and chain metadata of an mmcif_string.
+
+    Lighter variant of `parse`: the returned MmcifObject carries the header,
+    the parsed mmCIF dict, the protein chains and the mmCIF -> author chain id
+    mapping, while `structure`, `chain_to_seqres` and `seqres_to_structure`
+    are left as None.
+
+    Results are memoised by `functools.lru_cache`, so repeated calls with the
+    same arguments return the same ParsingResult object.
+
+    Args:
+        file_id: A string identifier for this file. Should be unique within the
+            collection of files being processed.
+        mmcif_string: Contents of an mmCIF file.
+        catch_all_errors: If True, all exceptions are caught and error messages are
+            returned as part of the ParsingResult. If False exceptions will be allowed
+            to propagate.
+
+    Returns:
+        A ParsingResult. `mmcif_object` is None when no protein chain is found
+        or when an exception was caught.
+    """
+    errors = {}
+    try:
+        parser = MMCIFParser(QUIET=True)
+        handle = io.StringIO(mmcif_string)
+        # The returned structure is not used here; the call is kept because
+        # parser._mmcif_dict is read right after it.
+        parser.get_structure("", handle)
+        parsed_info = parser._mmcif_dict  # pylint:disable=protected-access
+
+        # Ensure all values are lists, even if singletons.
+        for key, value in parsed_info.items():
+            if not isinstance(value, list):
+                parsed_info[key] = [value]
+
+        header = _get_header(parsed_info)
+
+        # Determine the protein chains.
+        valid_chains = _get_protein_chains(parsed_info=parsed_info)
+        if not valid_chains:
+            return ParsingResult(
+                None, {(file_id, ""): "No protein chains found in this file."}
+            )
+
+        # Map internal mmCIF chain ids to author chain ids using the atoms of
+        # the first model only.
+        mmcif_to_author_chain_id = {}
+        for atom in _get_atom_site_list(parsed_info):
+            if atom.model_num != "1":
+                # We only process the first model at the moment.
+                continue
+            mmcif_to_author_chain_id[atom.mmcif_chain_id] = atom.author_chain_id
+
+        mmcif_object = MmcifObject(
+            file_id=file_id,
+            header=header,
+            structure=None,
+            chain_to_seqres=None,
+            seqres_to_structure=None,
+            raw_string=parsed_info,
+            mmcif_to_author_chain_id=mmcif_to_author_chain_id,
+            valid_chains=valid_chains,
+        )
+
+        return ParsingResult(mmcif_object=mmcif_object, errors=errors)
+    except Exception as e:  # pylint:disable=broad-except
+        errors[(file_id, "")] = e
+        if not catch_all_errors:
+            raise
+        return ParsingResult(mmcif_object=None, errors=errors)
+
+
+@functools.lru_cache(16, typed=False)
+def parse(
+    *, file_id: str, mmcif_string: str, catch_all_errors: bool = True
+) -> ParsingResult:
+    """Entry point, parses an mmcif_string.
+
+    Builds the first-model Biopython structure, the per-chain one-letter
+    sequences and, for every protein chain, a mapping from sequence index to
+    the residue found (or flagged missing) in the structure.
+
+    Results are memoised by `functools.lru_cache`, so repeated calls with the
+    same arguments return the same ParsingResult object.
+
+    Args:
+        file_id: A string identifier for this file. Should be unique within the
+            collection of files being processed.
+        mmcif_string: Contents of an mmCIF file.
+        catch_all_errors: If True, all exceptions are caught and error messages are
+            returned as part of the ParsingResult. If False exceptions will be allowed
+            to propagate.
+
+    Returns:
+        A ParsingResult. `mmcif_object` is None when no protein chain is found
+        or when an exception was caught; in both cases `errors` has one entry
+        keyed by (file_id, "").
+    """
+    errors = {}
+    try:
+        parser = PDB.MMCIFParser(QUIET=True)
+        handle = io.StringIO(mmcif_string)
+        full_structure = parser.get_structure("", handle)
+        first_model_structure = _get_first_model(full_structure)
+        # Extract the _mmcif_dict from the parser, which contains useful fields not
+        # reflected in the Biopython structure.
+        parsed_info = parser._mmcif_dict  # pylint:disable=protected-access
+
+        # Ensure all values are lists, even if singletons.
+        for key, value in parsed_info.items():
+            if not isinstance(value, list):
+                parsed_info[key] = [value]
+
+        header = _get_header(parsed_info)
+
+        # Determine the protein chains, and their start numbers according to the
+        # internal mmCIF numbering scheme (likely but not guaranteed to be 1).
+        valid_chains = _get_protein_chains(parsed_info=parsed_info)
+        if not valid_chains:
+            return ParsingResult(
+                None, {(file_id, ""): "No protein chains found in this file."}
+            )
+        # Smallest monomer number per chain; subtracted below to turn mmCIF
+        # sequence numbers into zero-based sequence indices.
+        seq_start_num = {
+            chain_id: min([monomer.num for monomer in seq])
+            for chain_id, seq in valid_chains.items()
+        }
+
+        # Loop over the atoms for which we have coordinates. Populate two mappings:
+        # -mmcif_to_author_chain_id (maps internal mmCIF chain ids to chain ids used
+        # the authors / Biopython).
+        # -seq_to_structure_mappings (maps idx into sequence to ResidueAtPosition).
+        mmcif_to_author_chain_id = {}
+        seq_to_structure_mappings = {}
+        for atom in _get_atom_site_list(parsed_info):
+            if atom.model_num != "1":
+                # We only process the first model at the moment.
+                continue
+
+            mmcif_to_author_chain_id[atom.mmcif_chain_id] = atom.author_chain_id
+
+            if atom.mmcif_chain_id in valid_chains:
+                hetflag = " "
+                if atom.hetatm_atom == "HETATM":
+                    # Water atoms are assigned a special hetflag of W in Biopython. We
+                    # need to do the same, so that this hetflag can be used to fetch
+                    # a residue from the Biopython structure by id.
+                    if atom.residue_name in ("HOH", "WAT"):
+                        hetflag = "W"
+                    else:
+                        hetflag = "H_" + atom.residue_name
+                insertion_code = atom.insertion_code
+                if not _is_set(atom.insertion_code):
+                    insertion_code = " "
+                # int() raises ValueError for a non-integer author sequence
+                # number; that ends up in the broad except below.
+                position = ResiduePosition(
+                    chain_id=atom.author_chain_id,
+                    residue_number=int(atom.author_seq_num),
+                    insertion_code=insertion_code,
+                )
+                seq_idx = int(atom.mmcif_seq_num) - seq_start_num[atom.mmcif_chain_id]
+                # The mapping is keyed by author chain id, not mmCIF chain id.
+                # Every atom of a residue overwrites the same seq_idx entry.
+                current = seq_to_structure_mappings.get(atom.author_chain_id, {})
+                current[seq_idx] = ResidueAtPosition(
+                    position=position,
+                    name=atom.residue_name,
+                    is_missing=False,
+                    hetflag=hetflag,
+                )
+                seq_to_structure_mappings[atom.author_chain_id] = current
+
+        # Add missing residue information to seq_to_structure_mappings.
+        # Both lookups below raise KeyError for a valid chain that had no
+        # first-model atoms; that also ends up in the broad except below.
+        for chain_id, seq_info in valid_chains.items():
+            author_chain = mmcif_to_author_chain_id[chain_id]
+            current_mapping = seq_to_structure_mappings[author_chain]
+            for idx, monomer in enumerate(seq_info):
+                if idx not in current_mapping:
+                    current_mapping[idx] = ResidueAtPosition(
+                        position=None, name=monomer.id, is_missing=True, hetflag=" "
+                    )
+
+        # Build the one-letter sequence of every chain, keyed by author chain id.
+        author_chain_to_sequence = {}
+        for chain_id, seq_info in valid_chains.items():
+            author_chain = mmcif_to_author_chain_id[chain_id]
+            seq = []
+            for monomer in seq_info:
+                # Unknown components, and components whose mapped code is not
+                # exactly one character, become "X".
+                code = SCOPData.protein_letters_3to1.get(monomer.id, "X")
+                seq.append(code if len(code) == 1 else "X")
+            seq = "".join(seq)
+            author_chain_to_sequence[author_chain] = seq
+
+        mmcif_object = MmcifObject(
+            file_id=file_id,
+            header=header,
+            structure=first_model_structure,
+            chain_to_seqres=author_chain_to_sequence,
+            seqres_to_structure=seq_to_structure_mappings,
+            raw_string=parsed_info,
+            mmcif_to_author_chain_id=mmcif_to_author_chain_id,
+            valid_chains=valid_chains,
+        )
+
+        return ParsingResult(mmcif_object=mmcif_object, errors=errors)
+    except Exception as e:  # pylint:disable=broad-except
+        # Record the failure under (file_id, ""), then either re-raise or
+        # return an empty result, depending on catch_all_errors.
+        errors[(file_id, "")] = e
+        if not catch_all_errors:
+            raise
+        return ParsingResult(mmcif_object=None, errors=errors)
+
+
+def _get_first_model(structure: PdbStructure) -> PdbStructure:
+    """Returns the first model in a Biopython structure.
+
+    Raises StopIteration when `structure.get_models()` yields nothing.
+
+    NOTE(review): the value returned is an element of `get_models()`, so the
+    `PdbStructure` return annotation looks imprecise - confirm.
+    """
+    return next(structure.get_models())
+
+
+# NOTE(review): not referenced anywhere in the visible part of this module;
+# presumably a chain-length threshold used by other code - confirm.
+_MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDE = 21
+
+
+def get_release_date(parsed_info: MmCIFDict) -> str:
+    """Returns the oldest revision date.
+
+    The smallest entry of `_pdbx_audit_revision_history.revision_date` is
+    returned, using plain string comparison.
+
+    Raises:
+        KeyError: If the revision history item is absent from `parsed_info`.
+    """
+    return min(parsed_info["_pdbx_audit_revision_history.revision_date"])
+
+
+def _get_header(parsed_info: MmCIFDict) -> PdbHeader:
+    """Returns a basic header containing method, release date and resolution.
+
+    Args:
+        parsed_info: A dict of parsed mmCIF data whose values are lists.
+
+    Returns:
+        A dict with:
+            structure_method: comma-joined, lower-cased `_exptl.method` values.
+            release_date: the oldest revision date; the key is omitted (and a
+                warning logged) when the revision history is absent.
+            resolution: float taken from the first resolution item, in the
+                priority order listed below, that is present and parses;
+                0.00 when none does.
+    """
+    header = {}
+
+    experiments = mmcif_loop_to_list("_exptl.", parsed_info)
+    header["structure_method"] = ",".join(
+        [experiment["_exptl.method"].lower() for experiment in experiments]
+    )
+
+    # Note: The release_date here corresponds to the oldest revision. We prefer to
+    # use this for dataset filtering over the deposition_date.
+    if "_pdbx_audit_revision_history.revision_date" in parsed_info:
+        header["release_date"] = get_release_date(parsed_info)
+    else:
+        logging.warning(
+            "Could not determine release_date: %s", parsed_info["_entry.id"]
+        )
+
+    header["resolution"] = 0.00
+    # Keys are listed in priority order: stop at the first one that yields a
+    # valid number so a later key cannot override an earlier one.
+    for res_key in (
+        "_refine.ls_d_res_high",
+        "_em_3d_reconstruction.resolution",
+        "_reflns.d_resolution_high",
+    ):
+        if res_key in parsed_info:
+            try:
+                raw_resolution = parsed_info[res_key][0]
+                header["resolution"] = float(raw_resolution)
+                break
+            except ValueError:
+                # Unparseable value (e.g. "?"): fall through to the next key.
+                logging.debug("Invalid resolution format: %s", parsed_info[res_key])
+
+    return header
+
+
+def _get_atom_site_list(parsed_info: MmCIFDict) -> Sequence[AtomSite]:
+    """Returns list of atom sites; contains data not present in the structure.
+
+    Each AtomSite is filled positionally from the `_atom_site` columns listed
+    below, so their order must match the AtomSite field order. Values are
+    passed through as parsed, without conversion.
+    """
+    column_names = (
+        "_atom_site.label_comp_id",
+        "_atom_site.auth_asym_id",
+        "_atom_site.label_asym_id",
+        "_atom_site.auth_seq_id",
+        "_atom_site.label_seq_id",
+        "_atom_site.pdbx_PDB_ins_code",
+        "_atom_site.group_PDB",
+        "_atom_site.pdbx_PDB_model_num",
+    )
+    columns = [parsed_info[name] for name in column_names]
+    atom_sites = []
+    for row in zip(*columns):
+        atom_sites.append(AtomSite(*row))
+    return atom_sites
+
+
+def _get_protein_chains(
+    *, parsed_info: Mapping[str, Any]
+) -> Mapping[ChainId, Sequence[Monomer]]:
+    """Extracts polymer information for protein chains only.
+
+    A polymer entity counts as protein when at least one of its monomers has
+    a `_chem_comp.type` containing "peptide".
+
+    Args:
+        parsed_info: _mmcif_dict produced by the Biopython parser.
+
+    Returns:
+        A dict mapping mmcif chain id to a list of Monomers. Chains of the
+        same entity share the same list object.
+    """
+    # Group the monomers of every polymer entity, in file order.
+    polymers = collections.defaultdict(list)
+    for row in mmcif_loop_to_list("_entity_poly_seq.", parsed_info):
+        monomers = polymers[row["_entity_poly_seq.entity_id"]]
+        monomers.append(
+            Monomer(
+                id=row["_entity_poly_seq.mon_id"],
+                num=int(row["_entity_poly_seq.num"]),
+            )
+        )
+
+    # Chemical component table, used to tell peptides from other polymers.
+    chem_comps = mmcif_loop_to_dict("_chem_comp.", "_chem_comp.id", parsed_info)
+
+    # Entity -> mmCIF chain ids, so the result can be keyed by chain rather
+    # than by entity.
+    entity_to_mmcif_chains = collections.defaultdict(list)
+    for row in mmcif_loop_to_list("_struct_asym.", parsed_info):
+        asym_id = row["_struct_asym.id"]
+        owner_entity = row["_struct_asym.entity_id"]
+        entity_to_mmcif_chains[owner_entity].append(asym_id)
+
+    valid_chains = {}
+    for entity_id, seq_info in polymers.items():
+        chain_ids = entity_to_mmcif_chains[entity_id]
+
+        # The list is built eagerly on purpose: every monomer is looked up in
+        # chem_comps even after a peptide has been found.
+        peptide_flags = [
+            "peptide" in chem_comps[monomer.id]["_chem_comp.type"]
+            for monomer in seq_info
+        ]
+        if not any(peptide_flags):
+            # No peptide-like component at all, e.g. DNA/RNA: reject.
+            continue
+        for chain_id in chain_ids:
+            valid_chains[chain_id] = seq_info
+    return valid_chains
+
+
+def _is_set(data: str) -> bool:
+    """Returns False if data is a special mmCIF character indicating 'unset'.
+
+    The two unset markers are "." and "?"; anything else counts as set.
+    """
+    is_unset = data == "." or data == "?"
+    return not is_unset
--- a/unifold/msa/msa_identifiers.py
+++ b/unifold/msa/msa_identifiers.py
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#            http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for extracting identifiers from MSA sequence descriptions."""
+
+import dataclasses
+import re
+from typing import Optional
+
+
+# Sequences coming from the UniProtKB database use the
+# `db|UniqueIdentifier|EntryName` format, e.g. `tr|A0A146SKV9|A0A146SKV9_FUNHE`
+# or `sp|P0C2L1|A3X1_LOXLA` (for TREMBL/Swiss-Prot respectively).
+# The pattern is compiled with re.VERBOSE, so the whitespace and the
+# #-comments inside it are not part of the match. It is anchored at both ends
+# and exposes two named groups: AccessionIdentifier and SpeciesIdentifier.
+_UNIPROT_PATTERN = re.compile(
+    r"""
+        ^
+        # UniProtKB/TrEMBL or UniProtKB/Swiss-Prot
+        (?:tr|sp)
+        \|
+        # A primary accession number of the UniProtKB entry.
+        (?P<AccessionIdentifier>[A-Za-z0-9]{6,10})
+        # Occasionally there is a _0 or _1 isoform suffix, which we ignore.
+        (?:_\d)?
+        \|
+        # TREMBL repeats the accession ID here. Swiss-Prot has a mnemonic
+        # protein ID code.
+        (?:[A-Za-z0-9]+)
+        _
+        # A mnemonic species identification code.
+        (?P<SpeciesIdentifier>([A-Za-z0-9]){1,5})
+        # Small BFD uses a final value after an underscore, which we ignore.
+        (?:_\d+)?
+        $
+        """,
+    re.VERBOSE,
+)
+
+
+@dataclasses.dataclass(frozen=True)
+class Identifiers:
+    """Identifiers extracted from an MSA sequence description."""
+
+    # SpeciesIdentifier group of _UNIPROT_PATTERN; "" when nothing matched.
+    species_id: str = ""
+
+
+def _parse_sequence_identifier(msa_sequence_identifier: str) -> Identifiers:
+    """Gets the species id from an msa sequence identifier.
+
+    The identifier is stripped of surrounding whitespace and matched against
+    _UNIPROT_PATTERN.
+    An example of a sequence identifier: `tr|A0A146SKV9|A0A146SKV9_FUNHE`
+
+    Args:
+        msa_sequence_identifier: a sequence identifier.
+
+    Returns:
+        An `Identifiers` instance with a species_id. The species_id is empty
+        when the identifier does not match the pattern.
+    """
+    match = _UNIPROT_PATTERN.search(msa_sequence_identifier.strip())
+    if match is None:
+        return Identifiers()
+    return Identifiers(species_id=match.group("SpeciesIdentifier"))
+
+
+def _extract_sequence_identifier(description: str) -> Optional[str]:
+    """Extracts sequence identifier from description. Returns None if no match.
+
+    The identifier is the first whitespace-separated token of the description,
+    cut at its first "/" (if any).
+    """
+    tokens = description.split()
+    if not tokens:
+        return None
+    identifier, _, _ = tokens[0].partition("/")
+    return identifier
+
+
+def get_identifiers(description: str) -> Identifiers:
+    """Computes extra MSA features from the description.
+
+    Returns an empty `Identifiers` when the description holds no token at
+    all; otherwise the result of parsing its sequence identifier.
+    """
+    sequence_identifier = _extract_sequence_identifier(description)
+    if sequence_identifier is not None:
+        return _parse_sequence_identifier(sequence_identifier)
+    return Identifiers()
--- a/unifold/msa/parsers.py
+++ b/unifold/msa/parsers.py
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#            http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Functions for parsing various file formats."""
+import collections
+import dataclasses
+import itertools
+import re
+import string
+from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Set
+
+# One row per MSA sequence; entry [i][j] is the number of residues deleted
+# from aligned sequence i at residue position j.
+DeletionMatrix = Sequence[Sequence[int]]
+
+
+@dataclasses.dataclass(frozen=True)
+class Msa:
+    """Class representing a parsed MSA file.
+
+    The three fields are parallel: entry i of each describes the same
+    sequence. Construction fails when their lengths differ.
+    """
+
+    sequences: Sequence[str]
+    deletion_matrix: DeletionMatrix
+    descriptions: Sequence[str]
+
+    def __post_init__(self):
+        """Validates that all three fields have the same length."""
+        num_sequences = len(self.sequences)
+        num_rows = len(self.deletion_matrix)
+        num_descriptions = len(self.descriptions)
+        if num_sequences != num_rows or num_rows != num_descriptions:
+            raise ValueError(
+                "All fields for an MSA must have the same length. "
+                f"Got {num_sequences} sequences, "
+                f"{num_rows} rows in the deletion matrix and "
+                f"{num_descriptions} descriptions."
+            )
+
+    def __len__(self):
+        """Returns the number of sequences."""
+        return len(self.sequences)
+
+    def truncate(self, max_seqs: int):
+        """Returns a new Msa holding only the first `max_seqs` entries."""
+        head = slice(None, max_seqs)
+        return Msa(
+            sequences=self.sequences[head],
+            deletion_matrix=self.deletion_matrix[head],
+            descriptions=self.descriptions[head],
+        )
+
+
+@dataclasses.dataclass(frozen=True)
+class TemplateHit:
+    """Class representing a template hit.
+
+    NOTE(review): nothing in the visible part of this file constructs a
+    TemplateHit, so the field meanings are not documented here beyond their
+    names and types - confirm against the producer before relying on them.
+    """
+
+    index: int
+    name: str
+    aligned_cols: int
+    sum_probs: Optional[float]  # The only field that may be None.
+    query: str
+    hit_sequence: str
+    indices_query: List[int]
+    indices_hit: List[int]
+
+
+def parse_fasta(fasta_string: str) -> Tuple[Sequence[str], Sequence[str]]:
+    """Parses FASTA string and returns list of strings with amino-acid sequences.
+
+    Arguments:
+        fasta_string: The string contents of a FASTA file.
+
+    Returns:
+        A tuple of two lists:
+        * A list of sequences, each the concatenation of the stripped lines
+            that follow its header.
+        * A list of sequence descriptions taken from the header lines, with
+            the leading '>' removed. In the same order as the sequences.
+
+    Raises:
+        IndexError: If sequence data appears before the first '>' line.
+    """
+    sequences = []
+    descriptions = []
+    for raw_line in fasta_string.splitlines():
+        line = raw_line.strip()
+        if not line:
+            continue  # Skip blank lines.
+        if line.startswith(">"):
+            # A header opens a new, initially empty, sequence.
+            descriptions.append(line[1:])
+            sequences.append("")
+        else:
+            # Sequence data always belongs to the most recent header.
+            sequences[-1] += line
+
+    return sequences, descriptions
+
+
+def parse_stockholm(stockholm_string: str) -> Msa:
+    """Parses sequences and deletion matrix from stockholm format alignment.
+
+    Args:
+        stockholm_string: The string contents of a stockholm file. The first
+            sequence in the file should be the query sequence.
+
+    Returns:
+        An Msa holding:
+            * The sequences that have been aligned to the query, with the
+                columns where the query has a gap removed. These might
+                contain duplicates.
+            * The deletion matrix for the alignment as a list of lists. The element
+                at `deletion_matrix[i][j]` is the number of residues deleted from
+                the aligned sequence i at residue position j.
+            * The names of the targets matched, including the jackhmmer subsequence
+                suffix.
+    """
+    # Stitch together the chunks of every named sequence, in file order.
+    name_to_sequence = collections.OrderedDict()
+    for raw_line in stockholm_string.splitlines():
+        line = raw_line.strip()
+        if not line or line.startswith(("#", "//")):
+            continue
+        name, chunk = line.split()
+        name_to_sequence[name] = name_to_sequence.get(name, "") + chunk
+
+    aligned_rows = []
+    deletion_matrix = []
+
+    query = ""
+    keep_columns = []
+    for row_index, sequence in enumerate(name_to_sequence.values()):
+        if row_index == 0:
+            # The first row is the query; remember its non-gap columns.
+            query = sequence
+            keep_columns = [col for col, res in enumerate(query) if res != "-"]
+
+        # Keep only the columns where the query has a residue.
+        aligned_rows.append("".join(sequence[col] for col in keep_columns))
+
+        # Count the number of deletions w.r.t. query: residues sitting in
+        # query-gap columns are accumulated and attached to the next query
+        # residue column.
+        deletion_vec = []
+        pending = 0
+        for seq_res, query_res in zip(sequence, query):
+            if query_res != "-":
+                deletion_vec.append(pending)
+                pending = 0
+            elif seq_res != "-":
+                pending += 1
+        deletion_matrix.append(deletion_vec)
+
+    return Msa(
+        sequences=aligned_rows,
+        deletion_matrix=deletion_matrix,
+        descriptions=list(name_to_sequence.keys()),
+    )
+
+
+def parse_a3m(a3m_string: str) -> Msa:
+    """Parses sequences and deletion matrix from a3m format alignment.
+
+    Args:
+        a3m_string: The string contents of a a3m file. The first sequence in the
+            file should be the query sequence.
+
+    Returns:
+        An Msa holding:
+            * The sequences that have been aligned to the query, with ASCII
+                lowercase (insertion) characters removed. These might contain
+                duplicates.
+            * The deletion matrix for the alignment as a list of lists. The element
+                at `deletion_matrix[i][j]` is the number of residues deleted from
+                the aligned sequence i at residue position j.
+            * A list of descriptions, one per sequence, from the a3m file.
+    """
+    sequences, descriptions = parse_fasta(a3m_string)
+
+    deletion_matrix = []
+    for row in sequences:
+        counts = []
+        run_length = 0
+        for residue in row:
+            if not residue.islower():
+                # An aligned column: record the lowercase run before it.
+                counts.append(run_length)
+                run_length = 0
+            else:
+                run_length += 1
+        deletion_matrix.append(counts)
+
+    # Make the MSA matrix out of aligned (deletion-free) sequences.
+    strip_lowercase = str.maketrans("", "", string.ascii_lowercase)
+    aligned_sequences = []
+    for row in sequences:
+        aligned_sequences.append(row.translate(strip_lowercase))
+
+    return Msa(
+        sequences=aligned_sequences,
+        deletion_matrix=deletion_matrix,
+        descriptions=descriptions,
+    )
+
+
+def _convert_sto_seq_to_a3m(
+    query_non_gaps: Sequence[bool], sto_seq: str
+) -> Iterable[str]:
+    """Yields the a3m characters of one Stockholm row.
+
+    Columns where the query has a residue are passed through unchanged. In
+    columns where the query has a gap, a gap is dropped and any other
+    character is emitted in lowercase. Iteration stops at the shorter of the
+    two inputs.
+    """
+    for keep_column, residue in zip(query_non_gaps, sto_seq):
+        if keep_column:
+            yield residue
+            continue
+        if residue == "-":
+            continue
+        yield residue.lower()
+
+
+def convert_stockholm_to_a3m(
+    stockholm_format: str,
+    max_sequences: Optional[int] = None,
+    remove_first_row_gaps: bool = True,
+) -> str:
+    """Converts MSA in Stockholm format to the A3M format.
+
+    Args:
+        stockholm_format: The string contents of a Stockholm file.
+        max_sequences: If set (and non-zero), sequence names first seen after
+            this many distinct names have been collected are dropped.
+        remove_first_row_gaps: If True, the first sequence is treated as the
+            query: columns where it has a gap are removed, and residues of
+            other rows in those columns are written in lowercase.
+
+    Returns:
+        The alignment as text with one `>name description` line followed by
+        one sequence line per kept sequence, ending with a newline.
+    """
+    descriptions = {}
+    sequences = {}
+    reached_max_sequences = False
+
+    # First pass: collect the alignment rows, joining multi-chunk sequences.
+    for line in stockholm_format.splitlines():
+        # Recomputed before each line is handled, so after the loop it holds
+        # the state as it was before the last line was processed.
+        reached_max_sequences = max_sequences and len(sequences) >= max_sequences
+        if line.strip() and not line.startswith(("#", "//")):
+            # Ignore blank lines, markup and end symbols - remainder are alignment
+            # sequence parts.
+            seqname, aligned_seq = line.split(maxsplit=1)
+            if seqname not in sequences:
+                if reached_max_sequences:
+                    continue
+                sequences[seqname] = ""
+            sequences[seqname] += aligned_seq
+
+    # Second pass: collect the DE (description) annotation of the sequences.
+    for line in stockholm_format.splitlines():
+        if line[:4] == "#=GS":
+            # Description row - example format is:
+            # #=GS UniRef90_Q9H5Z4/4-78                        DE [subseq from] cDNA: FLJ22755 ...
+            columns = line.split(maxsplit=3)
+            seqname, feature = columns[1:3]
+            value = columns[3] if len(columns) == 4 else ""
+            if feature != "DE":
+                continue
+            if reached_max_sequences and seqname not in sequences:
+                continue
+            descriptions[seqname] = value
+            # Stop early once there are as many descriptions as sequences.
+            if len(descriptions) == len(sequences):
+                break
+
+    # Convert sto format to a3m line by line
+    a3m_sequences = {}
+    if remove_first_row_gaps:
+        # query_sequence is assumed to be the first sequence
+        # (next() raises StopIteration when no sequence was collected).
+        query_sequence = next(iter(sequences.values()))
+        query_non_gaps = [res != "-" for res in query_sequence]
+    for seqname, sto_sequence in sequences.items():
+        # Dots are optional in a3m format and are commonly removed.
+        # NOTE(review): query_non_gaps is built from the first row with its
+        # dots still in place, but is zipped against rows whose dots have
+        # been removed; if the input contains ".", the columns would no
+        # longer line up - confirm inputs never contain dots.
+        out_sequence = sto_sequence.replace(".", "")
+        if remove_first_row_gaps:
+            out_sequence = "".join(
+                _convert_sto_seq_to_a3m(query_non_gaps, out_sequence)
+            )
+        a3m_sequences[seqname] = out_sequence
+
+    # Sequences without a DE row get an empty description after the space.
+    fasta_chunks = (
+        f">{k} {descriptions.get(k, '')}\n{a3m_sequences[k]}" for k in a3m_sequences
+    )
+    return "\n".join(fasta_chunks) + "\n"  # Include terminating newline.
+
+
+def _keep_line(line: str, seqnames: Set[str]) -> bool:
+    """Decides whether a Stockholm line survives truncation to `seqnames`.
+
+    Blank lines, the "//" end tag, the "# STOCKHOLM" start tag and "#=GC RF"
+    reference annotation lines are always kept. "#=GS" description lines and
+    alignment rows are kept only when their sequence name is in `seqnames`.
+    All other markup lines are dropped.
+
+    Raises:
+        ValueError: If a "#=GS" line does not split into three fields.
+    """
+    stripped = line.strip()
+    if not stripped or stripped == "//":  # Blank line or end tag
+        return True
+    if line.startswith(("# STOCKHOLM", "#=GC RF")):  # Start tag / reference
+        return True
+    if line.startswith("#=GS"):  # Description lines - keep if sequence in list.
+        _, seqname, _ = line.split(maxsplit=2)
+        return seqname in seqnames
+    if line.startswith("#"):  # Other markup - filter out
+        return False
+    # Alignment data - keep if sequence in list.
+    seqname, _, _ = line.partition(" ")
+    return seqname in seqnames
+
+
+def truncate_stockholm_msa(stockholm_msa: str, max_sequences: int) -> str:
+    """Limits a Stockholm MSA to the sequences that appear first.
+
+    Args:
+        stockholm_msa: Contents of a Stockholm file.
+        max_sequences: Name collection stops once this many distinct sequence
+            names have been seen.
+
+    Returns:
+        The Stockholm text restricted to the collected sequences, terminated by
+        a newline.
+    """
+    kept_names = set()
+
+    # Pass 1: gather sequence names from alignment rows until the limit is hit.
+    for row in stockholm_msa.splitlines():
+        if not row.strip() or row.startswith(("#", "//")):
+            # Blank lines, markup and end symbols carry no alignment data.
+            continue
+        kept_names.add(row.partition(" ")[0])
+        if len(kept_names) >= max_sequences:
+            break
+
+    # Pass 2: keep structural lines plus rows that belong to a collected name.
+    surviving = [
+        row for row in stockholm_msa.splitlines() if _keep_line(row, kept_names)
+    ]
+    return "\n".join(surviving) + "\n"
+
+
+def remove_empty_columns_from_stockholm_msa(stockholm_msa: str) -> str:
+    """Removes empty columns (dashes-only) from a Stockholm MSA.
+
+    The alignment is processed chunk by chunk. A chunk is the run of alignment
+    rows that precedes a `#=GC RF` reference annotation row; the columns that
+    contain only `-` in every alignment row of the chunk are removed from those
+    rows and from the annotation row. If every column of a chunk is empty, all
+    lines of the chunk are replaced by empty lines. All other lines are passed
+    through unchanged.
+
+    Args:
+        stockholm_msa: Contents of a Stockholm file.
+
+    Returns:
+        The Stockholm text with empty columns removed (no trailing newline is
+        appended).
+    """
+    processed_lines = {}
+    unprocessed_lines = {}
+    for i, line in enumerate(stockholm_msa.splitlines()):
+        if line.startswith("#=GC RF"):
+            # Reached the end of this chunk of the alignment. Process chunk.
+            _, _, first_alignment = line.rpartition(" ")
+            # Split each row once, rather than once per column.
+            alignments = [
+                row.rpartition(" ")[2] for row in unprocessed_lines.values()
+            ]
+            # mask[j] is True when at least one row has a non-gap in column j.
+            mask = [
+                any(alignment[j] != "-" for alignment in alignments)
+                for j in range(len(first_alignment))
+            ]
+            # Add reference annotation for processing with mask.
+            unprocessed_lines[i] = line
+
+            if not any(mask):  # All columns were empty. Output empty lines for chunk.
+                for line_index in unprocessed_lines:
+                    processed_lines[line_index] = ""
+            else:
+                for line_index, unprocessed_line in unprocessed_lines.items():
+                    prefix, _, alignment = unprocessed_line.rpartition(" ")
+                    masked_alignment = "".join(itertools.compress(alignment, mask))
+                    processed_lines[line_index] = f"{prefix} {masked_alignment}"
+
+            # Start collecting the next chunk.
+            unprocessed_lines = {}
+        elif line.strip() and not line.startswith(("#", "//")):
+            unprocessed_lines[i] = line
+        else:
+            processed_lines[i] = line
+
+    # Alignment rows that were never followed by a `#=GC RF` row cannot be
+    # masked; pass them through untouched so that no line index is missing from
+    # the output (previously this raised KeyError or dropped lines).
+    processed_lines.update(unprocessed_lines)
+    return "\n".join(processed_lines[i] for i in range(len(processed_lines)))
+
+
+def deduplicate_stockholm_msa(stockholm_msa: str) -> str:
+    """Drops sequences that duplicate an earlier one on the query's columns.
+
+    Two sequences count as duplicates when they are identical after removing
+    every column in which the first (query) sequence has a gap. The first
+    occurrence is the one that is kept.
+
+    Args:
+        stockholm_msa: Contents of a Stockholm file.
+
+    Returns:
+        The Stockholm text restricted to the retained sequences, terminated by
+        a newline.
+    """
+    # Stitch the (possibly multi-chunk) alignment of every sequence together.
+    alignments_by_name = collections.defaultdict(str)
+    for raw in stockholm_msa.splitlines():
+        # Reference annotation, empty lines, descriptions and markup are not
+        # alignment rows.
+        if not raw.strip() or raw.startswith(("#", "//")):
+            continue
+        name, chunk = raw.strip().split()
+        alignments_by_name[name] += chunk
+
+    # The first alignment is the query; its gap columns are insertions.
+    query_alignment = next(iter(alignments_by_name.values()))
+    keep_column = [residue != "-" for residue in query_alignment]
+
+    unique_names = set()
+    seen = set()
+    for name, full_alignment in alignments_by_name.items():
+        # Compare sequences with the insertion columns stripped out.
+        collapsed = "".join(itertools.compress(full_alignment, keep_column))
+        if collapsed not in seen:
+            seen.add(collapsed)
+            unique_names.add(name)
+
+    kept = [
+        raw for raw in stockholm_msa.splitlines() if _keep_line(raw, unique_names)
+    ]
+    return "\n".join(kept) + "\n"
+
+
+def _get_hhr_line_regex_groups(
+    regex_pattern: str, line: str
+) -> Sequence[Optional[str]]:
+    """Matches `regex_pattern` at the start of `line` and returns its groups.
+
+    Raises:
+        RuntimeError: If the pattern does not match the line.
+    """
+    found = re.match(regex_pattern, line)
+    if found is not None:
+        return found.groups()
+    raise RuntimeError(f"Could not parse query line {line}")
+
+
+def _update_hhr_residue_indices_list(
+    sequence: str, start_index: int, indices_list: List[int]
+):
+    """Appends, in place, the original-sequence index of each aligned symbol.
+
+    Gaps (`-`) are recorded as -1 and do not advance the running index; every
+    other symbol is recorded with the running index, which starts at
+    `start_index` and is then incremented.
+    """
+    next_index = start_index
+    for residue in sequence:
+        is_gap = residue == "-"
+        indices_list.append(-1 if is_gap else next_index)
+        if not is_gap:
+            next_index += 1
+
+
+def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit:
+    """Parses the detailed HMM HMM comparison section for a single Hit.
+
+    This works on .hhr files generated from both HHBlits and HHSearch.
+
+    Args:
+        detailed_lines: A list of lines from a single comparison section between 2
+            sequences (which each have their own HMM's). The first line is
+            expected to end with the hit number, the second to hold the hit
+            name after its first character, and the third to be the summary
+            line (`Probab=... E-value=... ...`).
+
+    Returns:
+        A `TemplateHit` with the information from that detailed comparison
+        section.
+
+    Raises:
+        RuntimeError: If a certain line cannot be processed
+        AssertionError: If a parsed block length is inconsistent with the
+            start/end indices or between the query and hit rows.
+    """
+    # Parse first 2 lines.
+    number_of_hit = int(detailed_lines[0].split()[-1])
+    # Drop the first character of the name line.
+    name_hit = detailed_lines[1][1:]
+
+    # Parse the summary line.
+    pattern = (
+        "Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t"
+        " ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)[\t "
+        "]*Template_Neff=(.*)"
+    )
+    match = re.match(pattern, detailed_lines[2])
+    if match is None:
+        raise RuntimeError(
+            "Could not parse section: %s. Expected this: \n%s to contain summary."
+            % (detailed_lines, detailed_lines[2])
+        )
+    # All eight summary fields are converted to float; only Aligned_cols and
+    # Sum_probs are used.
+    (_, _, _, aligned_cols, _, _, sum_probs, _) = [float(x) for x in match.groups()]
+
+    # The next section reads the detailed comparisons. These are in a 'human
+    # readable' format which has a fixed length. The strategy employed is to
+    # assume that each block starts with the query sequence line, and to parse
+    # that with a regexp in order to deduce the fixed length used for that block.
+    query = ""
+    hit_sequence = ""
+    indices_query = []
+    indices_hit = []
+    # Length of the most recent query block; stays None until a query row is seen.
+    length_block = None
+
+    for line in detailed_lines[3:]:
+        # Parse the query sequence line
+        if (
+            line.startswith("Q ")
+            and not line.startswith("Q ss_dssp")
+            and not line.startswith("Q ss_pred")
+            and not line.startswith("Q Consensus")
+        ):
+            # Thus the first 17 characters must be 'Q <query_name> ', and we can parse
+            # everything after that.
+            #                            start        sequence             end             total_sequence_length
+            patt = r"[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)"
+            groups = _get_hhr_line_regex_groups(patt, line[17:])
+
+            # Get the length of the parsed block using the start and finish indices,
+            # and ensure it is the same as the actual block length.
+            start = int(groups[0]) - 1  # Make index zero based.
+            delta_query = groups[1]
+            end = int(groups[2])
+            num_insertions = len([x for x in delta_query if x == "-"])
+            length_block = end - start + num_insertions
+            assert length_block == len(delta_query)
+
+            # Update the query sequence and indices list.
+            query += delta_query
+            _update_hhr_residue_indices_list(delta_query, start, indices_query)
+
+        elif line.startswith("T "):
+            # Parse the hit sequence.
+            if (
+                not line.startswith("T ss_dssp")
+                and not line.startswith("T ss_pred")
+                and not line.startswith("T Consensus")
+            ):
+                # Thus the first 17 characters must be 'T <hit_name> ', and we can
+                # parse everything after that.
+                #                            start        sequence             end         total_sequence_length
+                patt = r"[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)"
+                groups = _get_hhr_line_regex_groups(patt, line[17:])
+                start = int(groups[0]) - 1  # Make index zero based.
+                delta_hit_sequence = groups[1]
+                # The hit row must be as long as the preceding query row.
+                # NOTE(review): if a hit row appears before any query row,
+                # length_block is still None and this assertion fails.
+                assert length_block == len(delta_hit_sequence)
+
+                # Update the hit sequence and indices list.
+                hit_sequence += delta_hit_sequence
+                _update_hhr_residue_indices_list(delta_hit_sequence, start, indices_hit)
+
+    return TemplateHit(
+        index=number_of_hit,
+        name=name_hit,
+        aligned_cols=int(aligned_cols),
+        sum_probs=sum_probs,
+        query=query,
+        hit_sequence=hit_sequence,
+        indices_query=indices_query,
+        indices_hit=indices_hit,
+    )
+
+
+def parse_hhr(hhr_string: str) -> Sequence[TemplateHit]:
+    """Parses every hit paragraph of an HHR file into a `TemplateHit`.
+
+    An .hhr file opens with a results table, followed by one paragraph per
+    hit. Each paragraph starts with a line beginning with 'No ' and extends up
+    to the next such line (or the end of the file).
+    """
+    lines = hhr_string.splitlines()
+
+    boundaries = [idx for idx, text in enumerate(lines) if text.startswith("No ")]
+    if not boundaries:
+        return []
+
+    # Close the final paragraph at the end of the file.
+    boundaries.append(len(lines))
+    return [
+        _parse_hhr_hit(lines[begin:end])
+        for begin, end in zip(boundaries, boundaries[1:])
+    ]
+
+
+def parse_e_values_from_tblout(tblout: str) -> Dict[str, float]:
+    """Parse target to e-value mapping parsed from Jackhmmer tblout string.
+
+    Args:
+        tblout: Contents of a Jackhmmer `--tblout` file.
+
+    Returns:
+        A mapping from target name to full-sequence E-value. The mapping always
+        contains the entry `"query"` with value 0.
+    """
+    e_values = {"query": 0}
+    # As per http://eddylab.org/software/hmmer/Userguide.pdf fields are
+    # space-delimited. Relevant fields are (1) target name and
+    # (5) E-value (full sequence) (numbering from 1).
+    for line in tblout.splitlines():
+        # Skip comment rows and blank lines. Blank lines used to crash with
+        # IndexError because the first character was indexed unconditionally.
+        if not line.strip() or line.startswith("#"):
+            continue
+        fields = line.split()
+        e_values[fields[0]] = float(fields[4])
+    return e_values
+
+
+def _get_indices(sequence: str, start: int) -> List[int]:
+    """Maps each aligned column of an a3m-style sequence to a residue index.
+
+    Gaps (`-`) yield -1 so that column alignment is preserved. Lowercase
+    symbols consume a residue index but produce no output entry. Every other
+    symbol yields the running index, which begins at `start`.
+    """
+    aligned_indices = []
+    position = start
+    for residue in sequence:
+        if residue == "-":
+            # Placeholder for a gap; the running index is not advanced.
+            aligned_indices.append(-1)
+            continue
+        if not residue.islower():
+            # Normal aligned residue.
+            aligned_indices.append(position)
+        # Both aligned and lowercase residues advance the running index.
+        position += 1
+    return aligned_indices
+
+
+@dataclasses.dataclass(frozen=True)
+class HitMetadata:
+    """Immutable record of the fields parsed from an hmmsearch description."""
+
+    # Identifier preceding the underscore, e.g. `4pqx` in `4pqx_A/2-217`.
+    pdb_id: str
+    # Chain identifier following the underscore, e.g. `A`.
+    chain: str
+    # First number of the `/start-end` range.
+    start: int
+    # Second number of the `/start-end` range.
+    end: int
+    # Value of the `length:` field of the description.
+    length: int
+    # Free text that follows the length field (may be empty).
+    text: str
+
+
+def _parse_hmmsearch_description(description: str) -> HitMetadata:
+    """Extracts hit metadata from an hmmsearch A3M description line.
+
+    Accepted inputs look like:
+        >4pqx_A/2-217 [subseq from] mol:protein length:217    Free text
+        >5g3r_A/1-55 [subseq from] mol:protein length:352
+
+    Raises:
+        ValueError: If the description does not have the expected layout.
+    """
+    parsed = re.match(
+        r"^>?([a-z0-9]+)_(\w+)/([0-9]+)-([0-9]+).*protein length:([0-9]+) *(.*)$",
+        description.strip(),
+    )
+    if parsed is None:
+        raise ValueError(f'Could not parse description: "{description}".')
+
+    pdb_id, chain, first, last, total_length, free_text = parsed.groups()
+    return HitMetadata(
+        pdb_id=pdb_id,
+        chain=chain,
+        start=int(first),
+        end=int(last),
+        length=int(total_length),
+        text=free_text,
+    )
+
+
+def parse_hmmsearch_a3m(
+    query_sequence: str, a3m_string: str, skip_first: bool = True
+) -> Sequence[TemplateHit]:
+    """Turns the hits of an hmmsearch a3m alignment into `TemplateHit`s.
+
+    Args:
+        query_sequence: The query sequence.
+        a3m_string: The a3m string produced by hmmsearch.
+        skip_first: Whether to skip the first sequence in the a3m string.
+
+    Returns:
+        A sequence of `TemplateHit` results, one per entry whose description
+        contains "mol:protein". The hit index is the 1-based position of the
+        entry among the entries considered, counting skipped non-protein ones.
+    """
+    # Pair every aligned sequence with its description line.
+    entries = list(zip(*parse_fasta(a3m_string)))
+    if skip_first:
+        entries = entries[1:]
+
+    query_indices = _get_indices(query_sequence, start=0)
+
+    template_hits = []
+    for rank, (aligned_hit, description) in enumerate(entries, start=1):
+        if "mol:protein" not in description:
+            continue  # Skip non-protein chains.
+        meta = _parse_hmmsearch_description(description)
+        # Only match states (uppercase residues) count as aligned columns.
+        num_aligned = sum(res.isupper() and res != "-" for res in aligned_hit)
+        template_hits.append(
+            TemplateHit(
+                index=rank,
+                name=f"{meta.pdb_id}_{meta.chain}",
+                aligned_cols=num_aligned,
+                sum_probs=None,
+                query=query_sequence,
+                hit_sequence=aligned_hit.upper(),
+                indices_query=query_indices,
+                indices_hit=_get_indices(aligned_hit, start=meta.start - 1),
+            )
+        )
+
+    return template_hits
--- a/unifold/msa/pipeline.py
+++ b/unifold/msa/pipeline.py
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#            http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Functions for building the input features for the unifold model."""
+
+import os
+from typing import Any, Mapping, MutableMapping, Optional, Sequence, Union
+from absl import logging
+from unifold.data import residue_constants
+from unifold.msa import msa_identifiers
+from unifold.msa import parsers
+from unifold.msa import templates
+from unifold.msa.tools import hhblits
+from unifold.msa.tools import hhsearch
+from unifold.msa.tools import hmmsearch
+from unifold.msa.tools import jackhmmer
+import numpy as np
+
+
+FeatureDict = MutableMapping[str, np.ndarray]
+TemplateSearcher = Union[hhsearch.HHSearch, hmmsearch.Hmmsearch]
+
+
+def make_sequence_features(
+    sequence: str, description: str, num_res: int
+) -> FeatureDict:
+    """Builds the per-sequence entries of the feature dict.
+
+    Args:
+        sequence: Amino acid sequence of the query.
+        description: Description of the query; stored UTF-8 encoded under
+            "domain_name".
+        num_res: Number of residues; sets the length of the per-residue arrays.
+
+    Returns:
+        A dict with the keys "aatype", "between_segment_residues",
+        "domain_name", "residue_index", "seq_length" and "sequence".
+    """
+    one_hot = residue_constants.sequence_to_onehot(
+        sequence=sequence,
+        mapping=residue_constants.restype_order_with_x,
+        map_unknown_to_x=True,
+    )
+    return {
+        "aatype": one_hot,
+        "between_segment_residues": np.zeros((num_res,), dtype=np.int32),
+        "domain_name": np.array([description.encode("utf-8")], dtype=np.object_),
+        # 0, 1, ..., num_res - 1.
+        "residue_index": np.arange(num_res, dtype=np.int32),
+        # The sequence length repeated once per residue.
+        "seq_length": np.full((num_res,), num_res, dtype=np.int32),
+        "sequence": np.array([sequence.encode("utf-8")], dtype=np.object_),
+    }
+
+
+def make_msa_features(msas: Sequence[parsers.Msa]) -> FeatureDict:
+    """Builds the MSA entries of the feature dict from one or more MSAs.
+
+    Sequences are taken from the MSAs in order; a sequence that has already
+    been seen (in this or an earlier MSA) is skipped together with its
+    deletion row and description.
+
+    Args:
+        msas: The MSAs to merge. The length of the first sequence of the first
+            MSA sets the length of the "num_alignments" array.
+
+    Returns:
+        A dict with the keys "deletion_matrix_int", "msa", "num_alignments" and
+        "msa_species_identifiers".
+
+    Raises:
+        ValueError: If no MSA is given or one of the MSAs is empty.
+    """
+    if not msas:
+        raise ValueError("At least one MSA must be provided.")
+
+    encoded_rows = []
+    deletion_rows = []
+    species = []
+    already_seen = set()
+    for msa_index, msa in enumerate(msas):
+        if not msa:
+            raise ValueError(f"MSA {msa_index} must contain at least one sequence.")
+        for row, aligned in enumerate(msa.sequences):
+            if aligned in already_seen:
+                continue
+            already_seen.add(aligned)
+            encoded_rows.append(
+                [residue_constants.HHBLITS_AA_TO_ID[aa] for aa in aligned]
+            )
+            deletion_rows.append(msa.deletion_matrix[row])
+            ids = msa_identifiers.get_identifiers(msa.descriptions[row])
+            species.append(ids.species_id.encode("utf-8"))
+
+    query_length = len(msas[0].sequences[0])
+    return {
+        "deletion_matrix_int": np.array(deletion_rows, dtype=np.int32),
+        "msa": np.array(encoded_rows, dtype=np.int32),
+        # Number of retained sequences, repeated once per query residue.
+        "num_alignments": np.array(
+            [len(encoded_rows)] * query_length, dtype=np.int32
+        ),
+        "msa_species_identifiers": np.array(species, dtype=np.object_),
+    }
+
+
+def run_msa_tool(
+    msa_runner,
+    input_fasta_path: str,
+    msa_out_path: str,
+    msa_format: str,
+    use_precomputed_msas: bool,
+) -> Mapping[str, Any]:
+    """Returns the MSA for a query, reusing a cached output file if allowed.
+
+    When `use_precomputed_msas` is set and `msa_out_path` already exists, the
+    file is read and returned as `{msa_format: <file contents>}`. Otherwise the
+    runner is queried, the `msa_format` entry of its first result is written
+    to `msa_out_path`, and that first result is returned.
+    """
+    if use_precomputed_msas and os.path.exists(msa_out_path):
+        logging.warning("Reading MSA from file %s", msa_out_path)
+        with open(msa_out_path, "r") as cached:
+            return {msa_format: cached.read()}
+
+    fresh = msa_runner.query(input_fasta_path)[0]
+    with open(msa_out_path, "w") as out:
+        out.write(fresh[msa_format])
+    return fresh
+
+
+class DataPipeline:
+    """Runs the alignment tools and assembles the input features."""
+
+    def __init__(
+        self,
+        jackhmmer_binary_path: str,
+        hhblits_binary_path: str,
+        uniref90_database_path: str,
+        mgnify_database_path: str,
+        bfd_database_path: Optional[str],
+        uniclust30_database_path: Optional[str],
+        small_bfd_database_path: Optional[str],
+        uniprot_database_path: Optional[str],
+        template_searcher: TemplateSearcher,
+        template_featurizer: templates.TemplateHitFeaturizer,
+        use_small_bfd: bool,
+        mgnify_max_hits: int = 501,
+        uniref_max_hits: int = 10000,
+        use_precomputed_msas: bool = False,
+    ):
+        """Initializes the data pipeline.
+
+        Args:
+            jackhmmer_binary_path: Binary used by every Jackhmmer runner.
+            hhblits_binary_path: Binary used by the HHBlits runner.
+            uniref90_database_path: Database for the UniRef90 Jackhmmer search.
+            mgnify_database_path: Database for the MGnify Jackhmmer search.
+            bfd_database_path: First HHBlits database; used only when
+                `use_small_bfd` is False.
+            uniclust30_database_path: Second HHBlits database; used only when
+                `use_small_bfd` is False.
+            small_bfd_database_path: Database for the small-BFD Jackhmmer
+                search; used only when `use_small_bfd` is True.
+            uniprot_database_path: Database for the UniProt Jackhmmer search
+                used by `process_uniprot`.
+            template_searcher: Searches for templates given an MSA.
+            template_featurizer: Turns template hits into template features.
+            use_small_bfd: Selects Jackhmmer on small BFD instead of HHBlits on
+                BFD + Uniclust30.
+            mgnify_max_hits: Maximum number of MGnify sequences kept.
+            uniref_max_hits: Maximum number of UniRef90 sequences kept.
+            use_precomputed_msas: Whether existing MSA output files are reused
+                instead of re-running the tools.
+        """
+        self._use_small_bfd = use_small_bfd
+        self.jackhmmer_uniref90_runner = jackhmmer.Jackhmmer(
+            binary_path=jackhmmer_binary_path, database_path=uniref90_database_path
+        )
+        # Exactly one of the two BFD runners is created.
+        if use_small_bfd:
+            self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer(
+                binary_path=jackhmmer_binary_path, database_path=small_bfd_database_path
+            )
+        else:
+            self.hhblits_bfd_uniclust_runner = hhblits.HHBlits(
+                binary_path=hhblits_binary_path,
+                databases=[bfd_database_path, uniclust30_database_path],
+            )
+        self.jackhmmer_mgnify_runner = jackhmmer.Jackhmmer(
+            binary_path=jackhmmer_binary_path, database_path=mgnify_database_path
+        )
+        # Created unconditionally, even when uniprot_database_path is None.
+        self.jackhmmer_uniprot_runner = jackhmmer.Jackhmmer(
+            binary_path=jackhmmer_binary_path, database_path=uniprot_database_path
+        )
+        self.template_searcher = template_searcher
+        self.template_featurizer = template_featurizer
+        self.mgnify_max_hits = mgnify_max_hits
+        self.uniref_max_hits = uniref_max_hits
+        self.use_precomputed_msas = use_precomputed_msas
+
+    def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
+        """Runs alignment tools on the input sequence and creates features.
+
+        Args:
+            input_fasta_path: FASTA file that must contain exactly one sequence.
+            msa_output_dir: Directory in which the MSA and template search
+                outputs are written (and looked up when precomputed MSAs are
+                enabled).
+
+        Returns:
+            The merged sequence, MSA and template features.
+
+        Raises:
+            ValueError: If the FASTA file does not contain exactly one sequence
+                or the template searcher reports an unknown input format.
+        """
+        with open(input_fasta_path) as f:
+            input_fasta_str = f.read()
+        input_seqs, input_descs = parsers.parse_fasta(input_fasta_str)
+        # NOTE(review): this also triggers for zero sequences, for which the
+        # message below is misleading.
+        if len(input_seqs) != 1:
+            raise ValueError(
+                f"More than one input sequence found in {input_fasta_path}."
+            )
+        input_sequence = input_seqs[0]
+        input_description = input_descs[0]
+        num_res = len(input_sequence)
+
+        uniref90_out_path = os.path.join(msa_output_dir, "uniref90_hits.sto")
+        jackhmmer_uniref90_result = run_msa_tool(
+            self.jackhmmer_uniref90_runner,
+            input_fasta_path,
+            uniref90_out_path,
+            "sto",
+            self.use_precomputed_msas,
+        )
+        mgnify_out_path = os.path.join(msa_output_dir, "mgnify_hits.sto")
+        jackhmmer_mgnify_result = run_msa_tool(
+            self.jackhmmer_mgnify_runner,
+            input_fasta_path,
+            mgnify_out_path,
+            "sto",
+            self.use_precomputed_msas,
+        )
+
+        # The template search uses a truncated, deduplicated copy of the
+        # UniRef90 MSA with empty columns removed.
+        msa_for_templates = jackhmmer_uniref90_result["sto"]
+        msa_for_templates = parsers.truncate_stockholm_msa(
+            msa_for_templates, max_sequences=self.uniref_max_hits
+        )
+        msa_for_templates = parsers.deduplicate_stockholm_msa(msa_for_templates)
+        msa_for_templates = parsers.remove_empty_columns_from_stockholm_msa(
+            msa_for_templates
+        )
+
+        # Feed the searcher the MSA in the format it declares.
+        if self.template_searcher.input_format == "sto":
+            pdb_templates_result = self.template_searcher.query(msa_for_templates)
+        elif self.template_searcher.input_format == "a3m":
+            uniref90_msa_as_a3m = parsers.convert_stockholm_to_a3m(msa_for_templates)
+            pdb_templates_result = self.template_searcher.query(uniref90_msa_as_a3m)
+        else:
+            raise ValueError(
+                "Unrecognized template input format: "
+                f"{self.template_searcher.input_format}"
+            )
+
+        # The raw template search output is always (re)written to disk.
+        pdb_hits_out_path = os.path.join(
+            msa_output_dir, f"pdb_hits.{self.template_searcher.output_format}"
+        )
+        with open(pdb_hits_out_path, "w") as f:
+            f.write(pdb_templates_result)
+
+        # The MSA features are built from the original (not deduplicated)
+        # Jackhmmer outputs, truncated to the configured limits.
+        uniref90_msa = parsers.parse_stockholm(jackhmmer_uniref90_result["sto"])
+        uniref90_msa = uniref90_msa.truncate(max_seqs=self.uniref_max_hits)
+        mgnify_msa = parsers.parse_stockholm(jackhmmer_mgnify_result["sto"])
+        mgnify_msa = mgnify_msa.truncate(max_seqs=self.mgnify_max_hits)
+
+        pdb_template_hits = self.template_searcher.get_template_hits(
+            output_string=pdb_templates_result, input_sequence=input_sequence
+        )
+
+        if self._use_small_bfd:
+            bfd_out_path = os.path.join(msa_output_dir, "small_bfd_hits.sto")
+            jackhmmer_small_bfd_result = run_msa_tool(
+                self.jackhmmer_small_bfd_runner,
+                input_fasta_path,
+                bfd_out_path,
+                "sto",
+                self.use_precomputed_msas,
+            )
+            bfd_msa = parsers.parse_stockholm(jackhmmer_small_bfd_result["sto"])
+        else:
+            bfd_out_path = os.path.join(msa_output_dir, "bfd_uniclust_hits.a3m")
+            hhblits_bfd_uniclust_result = run_msa_tool(
+                self.hhblits_bfd_uniclust_runner,
+                input_fasta_path,
+                bfd_out_path,
+                "a3m",
+                self.use_precomputed_msas,
+            )
+            bfd_msa = parsers.parse_a3m(hhblits_bfd_uniclust_result["a3m"])
+
+        templates_result = self.template_featurizer.get_templates(
+            query_sequence=input_sequence, hits=pdb_template_hits
+        )
+
+        sequence_features = make_sequence_features(
+            sequence=input_sequence, description=input_description, num_res=num_res
+        )
+
+        # Order matters: make_msa_features keeps the first occurrence of a
+        # duplicated sequence.
+        msa_features = make_msa_features((uniref90_msa, bfd_msa, mgnify_msa))
+
+        logging.info("Uniref90 MSA size: %d sequences.", len(uniref90_msa))
+        logging.info("BFD MSA size: %d sequences.", len(bfd_msa))
+        logging.info("MGnify MSA size: %d sequences.", len(mgnify_msa))
+        logging.info(
+            "Final (deduplicated) MSA size: %d sequences.",
+            msa_features["num_alignments"][0],
+        )
+        logging.info(
+            "Total number of templates (NB: this can include bad "
+            "templates and is later filtered to top 4): %d.",
+            templates_result.features["template_domain_names"].shape[0],
+        )
+
+        return {**sequence_features, **msa_features, **templates_result.features}
+
+    def process_uniprot(
+        self, input_fasta_path: str, msa_output_dir: str
+    ) -> FeatureDict:
+        """Builds MSA features from a Jackhmmer search against UniProt.
+
+        The search output is stored as `uniprot_hits.sto` in `msa_output_dir`,
+        and the parsed MSA is truncated to at most 50000 sequences.
+        """
+        uniprot_path = os.path.join(msa_output_dir, "uniprot_hits.sto")
+        uniprot_result = run_msa_tool(
+            self.jackhmmer_uniprot_runner,
+            input_fasta_path,
+            uniprot_path,
+            "sto",
+            self.use_precomputed_msas,
+        )
+        msa = parsers.parse_stockholm(uniprot_result["sto"])
+        msa = msa.truncate(max_seqs=50000)
+        all_seq_dict = make_msa_features([msa])
+        return all_seq_dict
--- a/unifold/msa/templates.py
+++ b/unifold/msa/templates.py
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#            http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Functions for getting templates and calculating template features."""
+import abc
+import dataclasses
+import datetime
+import functools
+import glob
+import os
+import re
+from typing import Any, Dict, Mapping, Optional, Sequence, Tuple
+
+from absl import logging
+from unifold.data import residue_constants
+from unifold.msa import mmcif, parsers
+from unifold.msa.tools import kalign
+import numpy as np
+
+
+class Error(Exception):
+    """Base class for template errors.
+
+    Prefilter errors derive from `PrefilterError`, not from this class.
+    """
+
+
+class NoChainsError(Error):
+    """Indicates that the template mmCIF did not have any chains."""
+
+
+class SequenceNotInTemplateError(Error):
+    """Indicates that the template mmCIF did not contain the sequence."""
+
+
+class NoAtomDataInTemplateError(Error):
+    """Indicates that the template mmCIF did not contain atom positions."""
+
+
+class TemplateAtomMaskAllZerosError(Error):
+    """Indicates that the template mmCIF had all atom positions masked."""
+
+
+class QueryToTemplateAlignError(Error):
+    """Indicates that the query could not be aligned to the template."""
+
+
+class CaDistanceError(Error):
+    """Indicates that a CA atom distance exceeds a threshold."""
+
+
+class MultipleChainsError(Error):
+    """Indicates that multiple chains were found for a given chain ID."""
+
+
+# Prefilter exceptions.
+class PrefilterError(Exception):
+    """Base class for template prefilter exceptions.
+
+    Derives directly from `Exception`, so it is not caught by `except Error`.
+    """
+
+
+class DateError(PrefilterError):
+    """Indicates that the hit's release date is after the max allowed date."""
+
+
+class AlignRatioError(PrefilterError):
+    """Indicates that the hit's align ratio to the query was too small.
+
+    The align ratio is the number of aligned columns divided by the query
+    length.
+    """
+
+
+class DuplicateError(PrefilterError):
+    """Indicates that the hit was an exact subsequence of the query.
+
+    Applies only when the hit also covers a large fraction of the query
+    length.
+    """
+
+
+class LengthError(PrefilterError):
+    """Indicates that the hit was too short."""
+
+
+# Names of the template features, each mapped to a NumPy dtype.
+# NOTE(review): presumably the dtype each feature array is cast to when the
+# features are assembled - the consumer is not visible here, confirm.
+TEMPLATE_FEATURES = {
+    "template_aatype": np.float32,
+    "template_all_atom_mask": np.float32,
+    "template_all_atom_positions": np.float32,
+    "template_domain_names": np.object_,
+    "template_sequence": np.object_,
+    "template_sum_probs": np.float32,
+}
+
+
+def _get_pdb_id_and_chain(hit: parsers.TemplateHit) -> Tuple[str, str]:
+    """Splits the leading `PDBID_chain` of a hit name into its two parts.
+
+    The PDB id is 4 alphanumeric characters and is returned lowercased. The
+    chain id is one or more alphanumeric characters or dots and is returned
+    unchanged.
+
+    Raises:
+        ValueError: If `hit.name` does not start with `PDBID_chain`.
+    """
+    matched = re.match(r"[a-zA-Z\d]{4}_[a-zA-Z0-9.]+", hit.name)
+    if matched is None:
+        raise ValueError(f"hit.name did not start with PDBID_chain: {hit.name}")
+    # The matched text contains exactly one underscore.
+    pdb_id, _, chain_id = matched.group(0).partition("_")
+    return pdb_id.lower(), chain_id
+
+
+def _is_after_cutoff(
+    pdb_id: str,
+    release_dates: Mapping[str, datetime.datetime],
+    release_date_cutoff: Optional[datetime.datetime],
+) -> bool:
+    """Tells whether a template was released after the cutoff date.
+
+    Args:
+        pdb_id: 4 letter pdb code.
+        release_dates: Dictionary mapping PDB ids to their structure release dates.
+        release_date_cutoff: Max release date that is valid for this query.
+
+    Returns:
+        True if the template release date is after the cutoff. False if it is
+        not, or if the release date of `pdb_id` is unknown.
+
+    Raises:
+        ValueError: If `release_date_cutoff` is None.
+    """
+    if release_date_cutoff is None:
+        raise ValueError("The release_date_cutoff must not be None.")
+    if pdb_id not in release_dates:
+        # This is only a cheap prefilter that reduces the number of mmCIF
+        # files to parse, so an unknown date is treated as "not after".
+        return False
+    return release_dates[pdb_id] > release_date_cutoff
+
+
+def _parse_obsolete(obsolete_file_path: str) -> Mapping[str, Optional[str]]:
+    """Reads the PDB listing of obsolete entries.
+
+    Only lines starting with "OBSLTE" are considered. After stripping, the
+    obsolete id is read from columns 20-24 and the first replacement id from
+    columns 29-33, e.g.:
+        'OBSLTE    06-NOV-19 6G9Y'               - removed, no replacement
+        'OBSLTE    31-JUL-94 116L     216L'      - replaced
+        'OBSLTE    26-SEP-06 2H33     2JM5 2OWI' - replaced by several
+
+    Returns:
+        A mapping from lowercased obsolete id to the lowercased id of its first
+        replacement, or to None when the entry was removed.
+    """
+    replacements = {}
+    with open(obsolete_file_path) as handle:
+        for raw in handle:
+            entry = raw.strip()
+            if not entry.startswith("OBSLTE"):
+                continue
+            obsolete_id = entry[20:24].lower()
+            if len(entry) > 30:
+                # Replaced by at least one structure; keep the first one.
+                replacements[obsolete_id] = entry[29:33].lower()
+            elif len(entry) == 24:
+                # Removed without a replacement.
+                replacements[obsolete_id] = None
+    return replacements
+
+
+def _parse_release_dates(path: str) -> Mapping[str, datetime.datetime]:
+    """Parses release dates file, returns a mapping from PDBs to release dates.
+
+    Each non-blank line is expected to look like `<pdb_id>: <YYYY-MM-DD...>`.
+    Only the first ten characters of the date are used, and blank lines are
+    ignored.
+
+    Args:
+        path: Path of the release dates file; must end in "txt".
+
+    Returns:
+        A mapping from PDB id to release date.
+
+    Raises:
+        ValueError: If `path` does not end in "txt" or a line has no `:`.
+    """
+    if not path.endswith("txt"):
+        raise ValueError("Invalid format of the release date file %s." % path)
+
+    release_dates = {}
+    with open(path, "r") as f:
+        for line in f:
+            if not line.strip():
+                # Tolerate blank lines (e.g. a trailing empty line) instead of
+                # failing to unpack them.
+                continue
+            # Split on the first colon only, so that a date carrying a time
+            # component (which contains colons) is still accepted.
+            pdb_id, date = line.split(":", maxsplit=1)
+            date = date.strip()
+            # Slicing the string manually is about 10x faster than strptime.
+            release_dates[pdb_id.strip()] = datetime.datetime(
+                year=int(date[:4]), month=int(date[5:7]), day=int(date[8:10])
+            )
+    return release_dates
+
+
+def _assess_hhsearch_hit(
+    hit: parsers.TemplateHit,
+    hit_pdb_code: str,
+    query_sequence: str,
+    release_dates: Mapping[str, datetime.datetime],
+    release_date_cutoff: datetime.datetime,
+    max_subsequence_ratio: float = 0.95,
+    min_align_ratio: float = 0.1,
+) -> bool:
+    """Determines if template is valid (without parsing the template mmcif file).
+
+    Args:
+        hit: HhrHit for the template.
+        hit_pdb_code: The 4 letter pdb code of the template hit. This might be
+            different from the value in the actual hit since the original pdb might
+            have become obsolete.
+        query_sequence: Amino acid sequence of the query.
+        release_dates: Dictionary mapping pdb codes to their structure release
+            dates.
+        release_date_cutoff: Max release date that is valid for this query.
+        max_subsequence_ratio: Exclude any exact matches with this much overlap.
+        min_align_ratio: Minimum overlap between the template and query.
+
+    Returns:
+        True if the hit passed the prefilter. Raises an exception otherwise.
+
+    Raises:
+        DateError: If the hit date was after the max allowed date.
+        AlignRatioError: If the hit align ratio to the query was too small.
+        DuplicateError: If the hit was an exact subsequence of the query.
+        LengthError: If the hit was too short.
+    """
+    aligned_cols = hit.aligned_cols
+    align_ratio = aligned_cols / len(query_sequence)
+
+    template_sequence = hit.hit_sequence.replace("-", "")
+    length_ratio = float(len(template_sequence)) / len(query_sequence)
+
+    # Check whether the template is a large subsequence or duplicate of original
+    # query. This can happen due to duplicate entries in the PDB database.
+    duplicate = (
+        template_sequence in query_sequence and length_ratio > max_subsequence_ratio
+    )
+
+    if _is_after_cutoff(hit_pdb_code, release_dates, release_date_cutoff):
+        raise DateError(
+            f"Date ({release_dates[hit_pdb_code]}) > max template date "
+            f"({release_date_cutoff})."
+        )
+
+    if align_ratio <= min_align_ratio:
+        raise AlignRatioError(
+            "Proportion of residues aligned to query too small. "
+            f"Align ratio: {align_ratio}."
+        )
+
+    if duplicate:
+        raise DuplicateError(
+            "Template is an exact subsequence of query with large "
+            f"coverage. Length ratio: {length_ratio}."
+        )
+
+    if len(template_sequence) < 10:
+        raise LengthError(f"Template too short. Length: {len(template_sequence)}.")
+
+    return True
+
+
+def _find_template_in_pdb(
+    template_chain_id: str, template_sequence: str, mmcif_object: mmcif.MmcifObject
+) -> Tuple[str, str, int]:
+    """Tries to find the template chain in the given pdb file.
+
+    This method tries the three following things in order:
+        1. Tries if there is an exact match in both the chain ID and the sequence.
+             If yes, the chain sequence is returned. Otherwise:
+        2. Tries if there is an exact match only in the sequence.
+             If yes, the chain sequence is returned. Otherwise:
+        3. Tries if there is a fuzzy match (X = wildcard) in the sequence.
+             If yes, the chain sequence is returned.
+    If none of these succeed, a SequenceNotInTemplateError is thrown.
+
+    Args:
+        template_chain_id: The template chain ID.
+        template_sequence: The template chain sequence.
+        mmcif_object: The PDB object to search for the template in.
+
+    Returns:
+        A tuple with:
+        * The chain sequence that was found to match the template in the PDB object.
+        * The ID of the chain that is being returned.
+        * The offset where the template sequence starts in the chain sequence.
+
+    Raises:
+        SequenceNotInTemplateError: If no match is found after the steps described
+            above.
+    """
+    # Try if there is an exact match in both the chain ID and the (sub)sequence.
+    pdb_id = mmcif_object.file_id
+    chain_sequence = mmcif_object.chain_to_seqres.get(template_chain_id)
+    if chain_sequence and (template_sequence in chain_sequence):
+        logging.info("Found an exact template match %s_%s.", pdb_id, template_chain_id)
+        mapping_offset = chain_sequence.find(template_sequence)
+        return chain_sequence, template_chain_id, mapping_offset
+
+    # Try if there is an exact match in the (sub)sequence only.
+    for chain_id, chain_sequence in mmcif_object.chain_to_seqres.items():
+        if chain_sequence and (template_sequence in chain_sequence):
+            logging.info("Found a sequence-only match %s_%s.", pdb_id, chain_id)
+            mapping_offset = chain_sequence.find(template_sequence)
+            return chain_sequence, chain_id, mapping_offset
+
+    # Return a chain sequence that fuzzy matches (X = wildcard) the template.
+    # Make parentheses unnamed groups (?:_) to avoid the 100 named groups limit.
+    regex = ["." if aa == "X" else "(?:%s|X)" % aa for aa in template_sequence]
+    regex = re.compile("".join(regex))
+    for chain_id, chain_sequence in mmcif_object.chain_to_seqres.items():
+        match = re.search(regex, chain_sequence)
+        if match:
+            logging.info("Found a fuzzy sequence-only match %s_%s.", pdb_id, chain_id)
+            mapping_offset = match.start()
+            return chain_sequence, chain_id, mapping_offset
+
+    # No hits, raise an error.
+    raise SequenceNotInTemplateError(
+        "Could not find the template sequence in %s_%s. Template sequence: %s, "
+        "chain_to_seqres: %s"
+        % (pdb_id, template_chain_id, template_sequence, mmcif_object.chain_to_seqres)
+    )
+
+
+def _realign_pdb_template_to_query(
+    old_template_sequence: str,
+    template_chain_id: str,
+    mmcif_object: mmcif.MmcifObject,
+    old_mapping: Mapping[int, int],
+    kalign_binary_path: str,
+) -> Tuple[str, Mapping[int, int]]:
+    """Aligns template from the mmcif_object to the query.
+
+    In case PDB70 contains a different version of the template sequence, we need
+    to perform a realignment to the actual sequence that is in the mmCIF file.
+    This method performs such realignment, but returns the new sequence and
+    mapping only if the sequence in the mmCIF file is 90% identical to the old
+    sequence.
+
+    Note that the old_template_sequence comes from the hit, and contains only that
+    part of the chain that matches with the query while the new_template_sequence
+    is the full chain.
+
+    Args:
+        old_template_sequence: The template sequence that was returned by the PDB
+            template search (typically done using HHSearch).
+        template_chain_id: The template chain id was returned by the PDB template
+            search (typically done using HHSearch). This is used to find the right
+            chain in the mmcif_object chain_to_seqres mapping.
+        mmcif_object: A mmcif_object which holds the actual template data.
+        old_mapping: A mapping from the query sequence to the template sequence.
+            This mapping will be used to compute the new mapping from the query
+            sequence to the actual mmcif_object template sequence by aligning the
+            old_template_sequence and the actual template sequence.
+        kalign_binary_path: The path to a kalign executable.
+
+    Returns:
+        A tuple (new_template_sequence, new_query_to_template_mapping) where:
+        * new_template_sequence is the actual template sequence that was found in
+            the mmcif_object.
+        * new_query_to_template_mapping is the new mapping from the query to the
+            actual template found in the mmcif_object.
+
+    Raises:
+        QueryToTemplateAlignError:
+        * If there was an error thrown by the alignment tool.
+        * Or if the actual template sequence differs by more than 10% from the
+            old_template_sequence.
+    """
+    aligner = kalign.Kalign(binary_path=kalign_binary_path)
+    new_template_sequence = mmcif_object.chain_to_seqres.get(template_chain_id, "")
+
+    # Sometimes the template chain id is unknown. But if there is only a single
+    # sequence within the mmcif_object, it is safe to assume it is that one.
+    if not new_template_sequence:
+        if len(mmcif_object.chain_to_seqres) == 1:
+            logging.info(
+                "Could not find %s in %s, but there is only 1 sequence, so "
+                "using that one.",
+                template_chain_id,
+                mmcif_object.file_id,
+            )
+            new_template_sequence = list(mmcif_object.chain_to_seqres.values())[0]
+        else:
+            raise QueryToTemplateAlignError(
+                f"Could not find chain {template_chain_id} in {mmcif_object.file_id}. "
+                "If there are no mmCIF parsing errors, it is possible it was not a "
+                "protein chain."
+            )
+
+    try:
+        parsed_a3m = parsers.parse_a3m(
+            aligner.align([old_template_sequence, new_template_sequence])
+        )
+        old_aligned_template, new_aligned_template = parsed_a3m.sequences
+    except Exception as e:
+        raise QueryToTemplateAlignError(
+            "Could not align old template %s to template %s (%s_%s). Error: %s"
+            % (
+                old_template_sequence,
+                new_template_sequence,
+                mmcif_object.file_id,
+                template_chain_id,
+                str(e),
+            )
+        )
+
+    logging.info(
+        "Old aligned template: %s\nNew aligned template: %s",
+        old_aligned_template,
+        new_aligned_template,
+    )
+
+    old_to_new_template_mapping = {}
+    old_template_index = -1
+    new_template_index = -1
+    num_same = 0
+    for old_template_aa, new_template_aa in zip(
+        old_aligned_template, new_aligned_template
+    ):
+        if old_template_aa != "-":
+            old_template_index += 1
+        if new_template_aa != "-":
+            new_template_index += 1
+        if old_template_aa != "-" and new_template_aa != "-":
+            old_to_new_template_mapping[old_template_index] = new_template_index
+            if old_template_aa == new_template_aa:
+                num_same += 1
+
+    # Require at least 90 % sequence identity wrt to the shorter of the sequences.
+    if (
+        float(num_same) / min(len(old_template_sequence), len(new_template_sequence))
+        < 0.9
+    ):
+        raise QueryToTemplateAlignError(
+            "Insufficient similarity of the sequence in the database: %s to the "
+            "actual sequence in the mmCIF file %s_%s: %s. We require at least "
+            "90 %% similarity wrt to the shorter of the sequences. This is not a "
+            "problem unless you think this is a template that should be included."
+            % (
+                old_template_sequence,
+                mmcif_object.file_id,
+                template_chain_id,
+                new_template_sequence,
+            )
+        )
+
+    new_query_to_template_mapping = {}
+    for query_index, old_template_index in old_mapping.items():
+        new_query_to_template_mapping[query_index] = old_to_new_template_mapping.get(
+            old_template_index, -1
+        )
+
+    new_template_sequence = new_template_sequence.replace("-", "")
+
+    return new_template_sequence, new_query_to_template_mapping
+
+
+def _check_residue_distances(
+    all_positions: np.ndarray, all_positions_mask: np.ndarray, max_ca_ca_distance: float
+):
+    """Checks if the distance between unmasked neighbor residues is ok."""
+    ca_position = residue_constants.atom_order["CA"]
+    prev_is_unmasked = False
+    prev_calpha = None
+    for i, (coords, mask) in enumerate(zip(all_positions, all_positions_mask)):
+        this_is_unmasked = bool(mask[ca_position])
+        if this_is_unmasked:
+            this_calpha = coords[ca_position]
+            if prev_is_unmasked:
+                distance = np.linalg.norm(this_calpha - prev_calpha)
+                if distance > max_ca_ca_distance:
+                    raise CaDistanceError(
+                        "The distance between residues %d and %d is %f > limit %f."
+                        % (i, i + 1, distance, max_ca_ca_distance)
+                    )
+            prev_calpha = this_calpha
+        prev_is_unmasked = this_is_unmasked
+
+
+def _get_atom_positions(
+    mmcif_object: mmcif.MmcifObject, auth_chain_id: str, max_ca_ca_distance: float
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Gets atom positions and mask from a list of Biopython Residues."""
+    num_res = len(mmcif_object.chain_to_seqres[auth_chain_id])
+
+    relevant_chains = [
+        c for c in mmcif_object.structure.get_chains() if c.id == auth_chain_id
+    ]
+    if len(relevant_chains) != 1:
+        raise MultipleChainsError(
+            f"Expected exactly one chain in structure with id {auth_chain_id}."
+        )
+    chain = relevant_chains[0]
+
+    all_positions = np.zeros([num_res, residue_constants.atom_type_num, 3])
+    all_positions_mask = np.zeros(
+        [num_res, residue_constants.atom_type_num], dtype=np.int64
+    )
+    for res_index in range(num_res):
+        pos = np.zeros([residue_constants.atom_type_num, 3], dtype=np.float32)
+        mask = np.zeros([residue_constants.atom_type_num], dtype=np.float32)
+        res_at_position = mmcif_object.seqres_to_structure[auth_chain_id][res_index]
+        if not res_at_position.is_missing:
+            res = chain[
+                (
+                    res_at_position.hetflag,
+                    res_at_position.position.residue_number,
+                    res_at_position.position.insertion_code,
+                )
+            ]
+            for atom in res.get_atoms():
+                atom_name = atom.get_name()
+                x, y, z = atom.get_coord()
+                if atom_name in residue_constants.atom_order.keys():
+                    pos[residue_constants.atom_order[atom_name]] = [x, y, z]
+                    mask[residue_constants.atom_order[atom_name]] = 1.0
+                elif atom_name.upper() == "SE" and res.get_resname() == "MSE":
+                    # Put the coordinates of the selenium atom in the sulphur column.
+                    pos[residue_constants.atom_order["SD"]] = [x, y, z]
+                    mask[residue_constants.atom_order["SD"]] = 1.0
+
+            # Fix naming errors in arginine residues where NH2 is incorrectly
+            # assigned to be closer to CD than NH1.
+            cd = residue_constants.atom_order["CD"]
+            nh1 = residue_constants.atom_order["NH1"]
+            nh2 = residue_constants.atom_order["NH2"]
+            if (
+                res.get_resname() == "ARG"
+                and all(mask[atom_index] for atom_index in (cd, nh1, nh2))
+                and (
+                    np.linalg.norm(pos[nh1] - pos[cd])
+                    > np.linalg.norm(pos[nh2] - pos[cd])
+                )
+            ):
+                pos[nh1], pos[nh2] = pos[nh2].copy(), pos[nh1].copy()
+                mask[nh1], mask[nh2] = mask[nh2].copy(), mask[nh1].copy()
+
+        all_positions[res_index] = pos
+        all_positions_mask[res_index] = mask
+    _check_residue_distances(all_positions, all_positions_mask, max_ca_ca_distance)
+    return all_positions, all_positions_mask
+
+
+def _extract_template_features(
+    mmcif_object: mmcif.MmcifObject,
+    pdb_id: str,
+    mapping: Mapping[int, int],
+    template_sequence: str,
+    query_sequence: str,
+    template_chain_id: str,
+    kalign_binary_path: str,
+) -> Tuple[Dict[str, Any], Optional[str]]:
+    """Parses atom positions in the target structure and aligns with the query.
+
+    Atoms for each residue in the template structure are indexed to coincide
+    with their corresponding residue in the query sequence, according to the
+    alignment mapping provided.
+
+    Args:
+        mmcif_object: mmcif_parsing.MmcifObject representing the template.
+        pdb_id: PDB code for the template.
+        mapping: Dictionary mapping indices in the query sequence to indices in
+            the template sequence.
+        template_sequence: String describing the amino acid sequence for the
+            template protein.
+        query_sequence: String describing the amino acid sequence for the query
+            protein.
+        template_chain_id: String ID describing which chain in the structure proto
+            should be used.
+        kalign_binary_path: The path to a kalign executable used for template
+                realignment.
+
+    Returns:
+        A tuple with:
+        * A dictionary containing the extra features derived from the template
+            protein structure.
+        * A warning message if the hit was realigned to the actual mmCIF sequence.
+            Otherwise None.
+
+    Raises:
+        NoChainsError: If the mmcif object doesn't contain any chains.
+        SequenceNotInTemplateError: If the given chain id / sequence can't
+            be found in the mmcif object.
+        QueryToTemplateAlignError: If the actual template in the mmCIF file
+            can't be aligned to the query.
+        NoAtomDataInTemplateError: If the mmcif object doesn't contain
+            atom positions.
+        TemplateAtomMaskAllZerosError: If the mmcif object doesn't have any
+            unmasked residues.
+    """
+    if mmcif_object is None or not mmcif_object.chain_to_seqres:
+        raise NoChainsError("No chains in PDB: %s_%s" % (pdb_id, template_chain_id))
+
+    warning = None
+    try:
+        seqres, chain_id, mapping_offset = _find_template_in_pdb(
+            template_chain_id=template_chain_id,
+            template_sequence=template_sequence,
+            mmcif_object=mmcif_object,
+        )
+    except SequenceNotInTemplateError:
+        # If PDB70 contains a different version of the template, we use the sequence
+        # from the mmcif_object.
+        chain_id = template_chain_id
+        warning = (
+            f"The exact sequence {template_sequence} was not found in "
+            f"{pdb_id}_{chain_id}. Realigning the template to the actual sequence."
+        )
+        logging.warning(warning)
+        # This throws an exception if it fails to realign the hit.
+        seqres, mapping = _realign_pdb_template_to_query(
+            old_template_sequence=template_sequence,
+            template_chain_id=template_chain_id,
+            mmcif_object=mmcif_object,
+            old_mapping=mapping,
+            kalign_binary_path=kalign_binary_path,
+        )
+        logging.info(
+            "Sequence in %s_%s: %s successfully realigned to %s",
+            pdb_id,
+            chain_id,
+            template_sequence,
+            seqres,
+        )
+        # The template sequence changed.
+        template_sequence = seqres
+        # No mapping offset, the query is aligned to the actual sequence.
+        mapping_offset = 0
+
+    try:
+        # Essentially set to infinity - we don't want to reject templates unless
+        # they're really really bad.
+        all_atom_positions, all_atom_mask = _get_atom_positions(
+            mmcif_object, chain_id, max_ca_ca_distance=150.0
+        )
+    except (CaDistanceError, KeyError) as ex:
+        raise NoAtomDataInTemplateError(
+            "Could not get atom data (%s_%s): %s" % (pdb_id, chain_id, str(ex))
+        ) from ex
+
+    all_atom_positions = np.split(all_atom_positions, all_atom_positions.shape[0])
+    all_atom_masks = np.split(all_atom_mask, all_atom_mask.shape[0])
+
+    output_templates_sequence = []
+    templates_all_atom_positions = []
+    templates_all_atom_masks = []
+
+    for _ in query_sequence:
+        # Residues in the query_sequence that are not in the template_sequence:
+        templates_all_atom_positions.append(
+            np.zeros((residue_constants.atom_type_num, 3))
+        )
+        templates_all_atom_masks.append(np.zeros(residue_constants.atom_type_num))
+        output_templates_sequence.append("-")
+
+    for k, v in mapping.items():
+        template_index = v + mapping_offset
+        templates_all_atom_positions[k] = all_atom_positions[template_index][0]
+        templates_all_atom_masks[k] = all_atom_masks[template_index][0]
+        output_templates_sequence[k] = template_sequence[v]
+
+    # Alanine (AA with the lowest number of atoms) has 5 atoms (C, CA, CB, N, O).
+    if np.sum(templates_all_atom_masks) < 5:
+        raise TemplateAtomMaskAllZerosError(
+            "Template all atom mask was all zeros: %s_%s. Residue range: %d-%d"
+            % (
+                pdb_id,
+                chain_id,
+                min(mapping.values()) + mapping_offset,
+                max(mapping.values()) + mapping_offset,
+            )
+        )
+
+    output_templates_sequence = "".join(output_templates_sequence)
+
+    templates_aatype = residue_constants.sequence_to_onehot(
+        output_templates_sequence, residue_constants.HHBLITS_AA_TO_ID
+    )
+
+    return (
+        {
+            "template_all_atom_positions": np.array(templates_all_atom_positions),
+            "template_all_atom_mask": np.array(templates_all_atom_masks),
+            "template_sequence": output_templates_sequence.encode(),
+            "template_aatype": np.array(templates_aatype),
+            "template_domain_names": f"{pdb_id.lower()}_{chain_id}".encode(),
+        },
+        warning,
+    )
+
+
+def _build_query_to_hit_index_mapping(
+    hit_query_sequence: str,
+    hit_sequence: str,
+    indices_hit: Sequence[int],
+    indices_query: Sequence[int],
+    original_query_sequence: str,
+) -> Mapping[int, int]:
+    """Gets mapping from indices in original query sequence to indices in the hit.
+
+    hit_query_sequence and hit_sequence are two aligned sequences containing gap
+    characters. hit_query_sequence contains only the part of the original query
+    sequence that matched the hit. When interpreting the indices from the .hhr, we
+    need to correct for this to recover a mapping from original query sequence to
+    the hit sequence.
+
+    Args:
+        hit_query_sequence: The portion of the query sequence that is in the .hhr
+            hit
+        hit_sequence: The portion of the hit sequence that is in the .hhr
+        indices_hit: The indices for each aminoacid relative to the hit sequence
+        indices_query: The indices for each aminoacid relative to the original query
+            sequence
+        original_query_sequence: String describing the original query sequence.
+
+    Returns:
+        Dictionary with indices in the original query sequence as keys and indices
+        in the hit sequence as values.
+    """
+    # If the hit is empty (no aligned residues), return empty mapping
+    if not hit_query_sequence:
+        return {}
+
+    # Remove gaps and find the offset of hit.query relative to original query.
+    hhsearch_query_sequence = hit_query_sequence.replace("-", "")
+    hit_sequence = hit_sequence.replace("-", "")
+    hhsearch_query_offset = original_query_sequence.find(hhsearch_query_sequence)
+
+    # Index of -1 used for gap characters. Subtract the min index ignoring gaps.
+    min_idx = min(x for x in indices_hit if x > -1)
+    fixed_indices_hit = [x - min_idx if x > -1 else -1 for x in indices_hit]
+
+    min_idx = min(x for x in indices_query if x > -1)
+    fixed_indices_query = [x - min_idx if x > -1 else -1 for x in indices_query]
+
+    # Zip the corrected indices, ignore case where both seqs have gap characters.
+    mapping = {}
+    for q_i, q_t in zip(fixed_indices_query, fixed_indices_hit):
+        if q_t != -1 and q_i != -1:
+            if q_t >= len(hit_sequence) or q_i + hhsearch_query_offset >= len(
+                original_query_sequence
+            ):
+                continue
+            mapping[q_i + hhsearch_query_offset] = q_t
+
+    return mapping
+
+
+@dataclasses.dataclass(frozen=True)
+class SingleHitResult:
+    """Immutable outcome of processing a single template hit."""
+
+    # Extracted template features, or None when no features were produced.
+    features: Optional[Mapping[str, Any]]
+    # Error message for this hit, or None when no error is reported.
+    error: Optional[str]
+    # Warning message for this hit, or None when there is nothing to warn about.
+    warning: Optional[str]
+
+
+@functools.lru_cache(16, typed=False)
+def _read_file(path):
+    with open(path, "r") as f:
+        file_data = f.read()
+    return file_data
+
+
+def _process_single_hit(
+    query_sequence: str,
+    hit: parsers.TemplateHit,
+    mmcif_dir: str,
+    max_template_date: datetime.datetime,
+    release_dates: Mapping[str, datetime.datetime],
+    obsolete_pdbs: Mapping[str, Optional[str]],
+    kalign_binary_path: str,
+    strict_error_check: bool = False,
+) -> SingleHitResult:
+    """Tries to extract template features from a single HHSearch hit."""
+    # Fail hard if we can't get the PDB ID and chain name from the hit.
+    hit_pdb_code, hit_chain_id = _get_pdb_id_and_chain(hit)
+
+    # This hit has been removed (obsoleted) from PDB, skip it.
+    if hit_pdb_code in obsolete_pdbs and obsolete_pdbs[hit_pdb_code] is None:
+        return SingleHitResult(
+            features=None, error=None, warning=f"Hit {hit_pdb_code} is obsolete."
+        )
+
+    if hit_pdb_code not in release_dates:
+        if hit_pdb_code in obsolete_pdbs:
+            hit_pdb_code = obsolete_pdbs[hit_pdb_code]
+
+    # Pass hit_pdb_code since it might have changed due to the pdb being obsolete.
+    try:
+        _assess_hhsearch_hit(
+            hit=hit,
+            hit_pdb_code=hit_pdb_code,
+            query_sequence=query_sequence,
+            release_dates=release_dates,
+            release_date_cutoff=max_template_date,
+        )
+    except PrefilterError as e:
+        msg = f"hit {hit_pdb_code}_{hit_chain_id} did not pass prefilter: {str(e)}"
+        logging.info(msg)
+        if strict_error_check and isinstance(e, (DateError, DuplicateError)):
+            # In strict mode we treat some prefilter cases as errors.
+            return SingleHitResult(features=None, error=msg, warning=None)
+
+        return SingleHitResult(features=None, error=None, warning=None)
+
+    mapping = _build_query_to_hit_index_mapping(
+        hit.query, hit.hit_sequence, hit.indices_hit, hit.indices_query, query_sequence
+    )
+
+    # The mapping is from the query to the actual hit sequence, so we need to
+    # remove gaps (which regardless have a missing confidence score).
+    template_sequence = hit.hit_sequence.replace("-", "")
+
+    cif_path = os.path.join(mmcif_dir, hit_pdb_code + ".cif")
+    logging.debug(
+        "Reading PDB entry from %s. Query: %s, template: %s",
+        cif_path,
+        query_sequence,
+        template_sequence,
+    )
+    # Fail if we can't find the mmCIF file.
+    cif_string = _read_file(cif_path)
+
+    parsing_result = mmcif.parse(file_id=hit_pdb_code, mmcif_string=cif_string)
+
+    if parsing_result.mmcif_object is not None:
+        hit_release_date = datetime.datetime.strptime(
+            parsing_result.mmcif_object.header["release_date"], "%Y-%m-%d"
+        )
+        if hit_release_date > max_template_date:
+            error = "Template %s date (%s) > max template date (%s)." % (
+                hit_pdb_code,
+                hit_release_date,
+                max_template_date,
+            )
+            if strict_error_check:
+                return SingleHitResult(features=None, error=error, warning=None)
+            else:
+                logging.debug(error)
+                return SingleHitResult(features=None, error=None, warning=None)
+
+    try:
+        features, realign_warning = _extract_template_features(
+            mmcif_object=parsing_result.mmcif_object,
+            pdb_id=hit_pdb_code,
+            mapping=mapping,
+            template_sequence=template_sequence,
+            query_sequence=query_sequence,
+            template_chain_id=hit_chain_id,
+            kalign_binary_path=kalign_binary_path,
+        )
+        if hit.sum_probs is None:
+            features["template_sum_probs"] = [0]
+        else:
+            features["template_sum_probs"] = [hit.sum_probs]
+
+        # It is possible there were some errors when parsing the other chains in the
+        # mmCIF file, but the template features for the chain we want were still
+        # computed. In such case the mmCIF parsing errors are not relevant.
+        return SingleHitResult(features=features, error=None, warning=realign_warning)
+    except (
+        NoChainsError,
+        NoAtomDataInTemplateError,
+        TemplateAtomMaskAllZerosError,
+    ) as e:
+        # These 3 errors indicate missing mmCIF experimental data rather than a
+        # problem with the template search, so turn them into warnings.
+        warning = (
+            "%s_%s (sum_probs: %s, rank: %s): feature extracting errors: "
+            "%s, mmCIF parsing errors: %s"
+            % (
+                hit_pdb_code,
+                hit_chain_id,
+                hit.sum_probs,
+                hit.index,
+                str(e),
+                parsing_result.errors,
+            )
+        )
+        if strict_error_check:
+            return SingleHitResult(features=None, error=warning, warning=None)
+        else:
+            return SingleHitResult(features=None, error=None, warning=warning)
+    except Error as e:
+        error = (
+            "%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: "
+            "%s, mmCIF parsing errors: %s"
+            % (
+                hit_pdb_code,
+                hit_chain_id,
+                hit.sum_probs,
+                hit.index,
+                str(e),
+                parsing_result.errors,
+            )
+        )
+        return SingleHitResult(features=None, error=error, warning=None)
+
+
+@dataclasses.dataclass(frozen=True)
+class TemplateSearchResult:
+    """Immutable result of a template search for one query sequence."""
+
+    # Template features, keyed by feature name.
+    features: Mapping[str, Any]
+    # Error messages collected during the search.
+    errors: Sequence[str]
+    # Warning messages collected during the search.
+    warnings: Sequence[str]
+
+
+class TemplateHitFeaturizer(abc.ABC):
+    """An abstract base class for turning template hits to template features."""
+
+    def __init__(
+        self,
+        mmcif_dir: str,
+        max_template_date: str,
+        max_hits: int,
+        kalign_binary_path: str,
+        release_dates_path: Optional[str],
+        obsolete_pdbs_path: Optional[str],
+        strict_error_check: bool = False,
+    ):
+        """Initializes the Template Search.
+
+        Args:
+            mmcif_dir: Path to a directory with mmCIF structures. Once a template ID
+                is found by HHSearch, this directory is used to retrieve the template
+                data.
+            max_template_date: The maximum date permitted for template structures. No
+                template with date higher than this date will be returned. In ISO8601
+                date format, YYYY-MM-DD.
+            max_hits: The maximum number of templates that will be returned.
+            kalign_binary_path: The path to a kalign executable used for template
+                realignment.
+            release_dates_path: An optional path to a file with a mapping from PDB IDs
+                to their release dates. Thanks to this we don't have to redundantly
+                parse mmCIF files to get that information.
+            obsolete_pdbs_path: An optional path to a file containing a mapping from
+                obsolete PDB IDs to the PDB IDs of their replacements.
+            strict_error_check: If True, then the following will be treated as errors:
+                * If any template date is after the max_template_date.
+                * If any template has identical PDB ID to the query.
+                * If any template is a duplicate of the query.
+                * Any feature computation errors.
+        """
+        self._mmcif_dir = mmcif_dir
+        if not glob.glob(os.path.join(self._mmcif_dir, "*.cif")):
+            logging.error("Could not find CIFs in %s", self._mmcif_dir)
+            raise ValueError(f"Could not find CIFs in {self._mmcif_dir}")
+
+        try:
+            self._max_template_date = datetime.datetime.strptime(
+                max_template_date, "%Y-%m-%d"
+            )
+        except ValueError:
+            raise ValueError(
+                "max_template_date must be set and have format YYYY-MM-DD."
+            )
+        self._max_hits = max_hits
+        self._kalign_binary_path = kalign_binary_path
+        self._strict_error_check = strict_error_check
+
+        if release_dates_path:
+            logging.info("Using precomputed release dates %s.", release_dates_path)
+            self._release_dates = _parse_release_dates(release_dates_path)
+        else:
+            self._release_dates = {}
+
+        if obsolete_pdbs_path:
+            logging.info("Using precomputed obsolete pdbs %s.", obsolete_pdbs_path)
+            self._obsolete_pdbs = _parse_obsolete(obsolete_pdbs_path)
+        else:
+            self._obsolete_pdbs = {}
+
+    @abc.abstractmethod
+    def get_templates(
+        self, query_sequence: str, hits: Sequence[parsers.TemplateHit]
+    ) -> TemplateSearchResult:
+        """Computes the templates for given query sequence.
+
+        Declared abstract: no implementation is provided here, so concrete
+        featurizers are expected to override this method.
+
+        Args:
+            query_sequence: The query sequence to find templates for.
+            hits: The template hits to turn into template features.
+
+        Returns:
+            A TemplateSearchResult.
+        """
+
+
+class HhsearchHitFeaturizer(TemplateHitFeaturizer):
+    """A class for turning a3m hits from hhsearch to template features."""
+
+    def get_templates(
+        self, query_sequence: str, hits: Sequence[parsers.TemplateHit]
+    ) -> TemplateSearchResult:
+        """Computes the templates for given query sequence (more details above)."""
+        logging.info("Searching for template for: %s", query_sequence)
+
+        template_features = {}
+        for template_feature_name in TEMPLATE_FEATURES:
+            template_features[template_feature_name] = []
+
+        num_hits = 0
+        errors = []
+        warnings = []
+
+        for hit in sorted(hits, key=lambda x: x.sum_probs, reverse=True):
+            # We got all the templates we wanted, stop processing hits.
+            if num_hits >= self._max_hits:
+                break
+
+            result = _process_single_hit(
+                query_sequence=query_sequence,
+                hit=hit,
+                mmcif_dir=self._mmcif_dir,
+                max_template_date=self._max_template_date,
+                release_dates=self._release_dates,
+                obsolete_pdbs=self._obsolete_pdbs,
+                strict_error_check=self._strict_error_check,
+                kalign_binary_path=self._kalign_binary_path,
+            )
+
+            if result.error:
+                errors.append(result.error)
+
+            # There could be an error even if there are some results, e.g. thrown by
+            # other unparsable chains in the same mmCIF file.
+            if result.warning:
+                warnings.append(result.warning)
+
+            if result.features is None:
+                logging.info(
+                    "Skipped invalid hit %s, error: %s, warning: %s",
+                    hit.name,
+                    result.error,
+                    result.warning,
+                )
+            else:
+                # Increment the hit counter, since we got features out of this hit.
+                num_hits += 1
+                for k in template_features:
+                    template_features[k].append(result.features[k])
+
+        for name in template_features:
+            if num_hits > 0:
+                template_features[name] = np.stack(
+                    template_features[name], axis=0
+                ).astype(TEMPLATE_FEATURES[name])
+            else:
+                # Make sure the feature has correct dtype even if empty.
+                template_features[name] = np.array([], dtype=TEMPLATE_FEATURES[name])
+
+        return TemplateSearchResult(
+            features=template_features, errors=errors, warnings=warnings
+        )
+
+
+class HmmsearchHitFeaturizer(TemplateHitFeaturizer):
+    """A class for turning a3m hits from hmmsearch to template features."""
+
+    def get_templates(
+        self, query_sequence: str, hits: Sequence[parsers.TemplateHit]
+    ) -> TemplateSearchResult:
+        """Computes the templates for given query sequence (more details above)."""
+        logging.info("Searching for template for: %s", query_sequence)
+
+        template_features = {}
+        for template_feature_name in TEMPLATE_FEATURES:
+            template_features[template_feature_name] = []
+
+        already_seen = set()
+        errors = []
+        warnings = []
+
+        if not hits or hits[0].sum_probs is None:
+            sorted_hits = hits
+        else:
+            sorted_hits = sorted(hits, key=lambda x: x.sum_probs, reverse=True)
+
+        for hit in sorted_hits:
+            # We got all the templates we wanted, stop processing hits.
+            if len(already_seen) >= self._max_hits:
+                break
+
+            result = _process_single_hit(
+                query_sequence=query_sequence,
+                hit=hit,
+                mmcif_dir=self._mmcif_dir,
+                max_template_date=self._max_template_date,
+                release_dates=self._release_dates,
+                obsolete_pdbs=self._obsolete_pdbs,
+                strict_error_check=self._strict_error_check,
+                kalign_binary_path=self._kalign_binary_path,
+            )
+
+            if result.error:
+                errors.append(result.error)
+
+            # There could be an error even if there are some results, e.g. thrown by
+            # other unparsable chains in the same mmCIF file.
+            if result.warning:
+                warnings.append(result.warning)
+
+            if result.features is None:
+                logging.debug(
+                    "Skipped invalid hit %s, error: %s, warning: %s",
+                    hit.name,
+                    result.error,
+                    result.warning,
+                )
+            else:
+                already_seen_key = result.features["template_sequence"]
+                if already_seen_key in already_seen:
+                    continue
+                # Increment the hit counter, since we got features out of this hit.
+                already_seen.add(already_seen_key)
+                for k in template_features:
+                    template_features[k].append(result.features[k])
+
+        if already_seen:
+            for name in template_features:
+                template_features[name] = np.stack(
+                    template_features[name], axis=0
+                ).astype(TEMPLATE_FEATURES[name])
+        else:
+            num_res = len(query_sequence)
+            # Construct a default template with all zeros.
+            template_features = {
+                "template_aatype": np.zeros(
+                    (1, num_res, len(residue_constants.restypes_with_x_and_gap)),
+                    np.float32,
+                ),
+                "template_all_atom_mask": np.zeros(
+                    (1, num_res, residue_constants.atom_type_num), np.float32
+                ),
+                "template_all_atom_positions": np.zeros(
+                    (1, num_res, residue_constants.atom_type_num, 3), np.float32
+                ),
+                "template_domain_names": np.array(["".encode()], dtype=np.object),
+                "template_sequence": np.array(["".encode()], dtype=np.object),
+                "template_sum_probs": np.array([0], dtype=np.float32),
+            }
+        return TemplateSearchResult(
+            features=template_features, errors=errors, warnings=warnings
+        )
--- a/unifold/msa/tools/__init__.py
+++ b/unifold/msa/tools/__init__.py
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Python wrappers for third party tools."""
\ No newline at end of file
--- a/unifold/msa/tools/__pycache__/__init__.cpython-37.pyc
+++ b/unifold/msa/tools/__pycache__/__init__.cpython-37.pyc
--- a/unifold/msa/tools/__pycache__/hhblits.cpython-37.pyc
+++ b/unifold/msa/tools/__pycache__/hhblits.cpython-37.pyc
--- a/unifold/msa/tools/__pycache__/hhsearch.cpython-37.pyc
+++ b/unifold/msa/tools/__pycache__/hhsearch.cpython-37.pyc
--- a/unifold/msa/tools/__pycache__/hmmbuild.cpython-37.pyc
+++ b/unifold/msa/tools/__pycache__/hmmbuild.cpython-37.pyc
--- a/unifold/msa/tools/__pycache__/hmmsearch.cpython-37.pyc
+++ b/unifold/msa/tools/__pycache__/hmmsearch.cpython-37.pyc
--- a/unifold/msa/tools/__pycache__/jackhmmer.cpython-37.pyc
+++ b/unifold/msa/tools/__pycache__/jackhmmer.cpython-37.pyc
--- a/unifold/msa/tools/__pycache__/kalign.cpython-37.pyc
+++ b/unifold/msa/tools/__pycache__/kalign.cpython-37.pyc
--- a/unifold/msa/tools/__pycache__/utils.cpython-37.pyc
+++ b/unifold/msa/tools/__pycache__/utils.cpython-37.pyc
--- a/unifold/msa/tools/hhblits.py
+++ b/unifold/msa/tools/hhblits.py
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#            http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Library to run HHblits from Python."""
+
+import glob
+import os
+import subprocess
+from typing import Any, List, Mapping, Optional, Sequence
+
+from absl import logging
+from . import utils
+
+
+# HHblits' own defaults for the -p and -Z options. HHBlits.query only adds those
+# flags to the command line when the configured value differs from these.
+_HHBLITS_DEFAULT_P = 20
+_HHBLITS_DEFAULT_Z = 500
+
+
+class HHBlits:
+    """Python wrapper of the HHblits binary."""
+
+    def __init__(
+        self,
+        *,
+        binary_path: str,
+        databases: Sequence[str],
+        n_cpu: int = 4,
+        n_iter: int = 3,
+        e_value: float = 0.001,
+        maxseq: int = 1_000_000,
+        realign_max: int = 100_000,
+        maxfilt: int = 100_000,
+        min_prefilter_hits: int = 1000,
+        all_seqs: bool = False,
+        alt: Optional[int] = None,
+        p: int = _HHBLITS_DEFAULT_P,
+        z: int = _HHBLITS_DEFAULT_Z,
+    ):
+        """Initializes the Python HHblits wrapper.
+
+        Args:
+            binary_path: The path to the HHblits executable.
+            databases: A sequence of HHblits database paths. This should be the
+                common prefix for the database files (i.e. up to but not including
+                _hhm.ffindex etc.)
+            n_cpu: The number of CPUs to give HHblits.
+            n_iter: The number of HHblits iterations.
+            e_value: The E-value, see HHblits docs for more details.
+            maxseq: The maximum number of rows in an input alignment. Note that this
+                parameter is only supported in HHBlits version 3.1 and higher.
+            realign_max: Max number of HMM-HMM hits to realign. HHblits default: 500.
+            maxfilt: Max number of hits allowed to pass the 2nd prefilter.
+                HHblits default: 20000.
+            min_prefilter_hits: Min number of hits to pass prefilter.
+                HHblits default: 100.
+            all_seqs: Return all sequences in the MSA / Do not filter the result MSA.
+                HHblits default: False.
+            alt: Show up to this many alternative alignments.
+            p: Minimum Prob for a hit to be included in the output hhr file.
+                HHblits default: 20.
+            z: Hard cap on number of hits reported in the hhr file.
+                HHblits default: 500. NB: The relevant HHblits flag is -Z not -z.
+
+        Raises:
+            RuntimeError: If HHblits binary not found within the path.
+        """
+        self.binary_path = binary_path
+        self.databases = databases
+
+        for database_path in self.databases:
+            if not glob.glob(database_path + "_*"):
+                logging.error("Could not find HHBlits database %s", database_path)
+                raise ValueError(f"Could not find HHBlits database {database_path}")
+
+        self.n_cpu = n_cpu
+        self.n_iter = n_iter
+        self.e_value = e_value
+        self.maxseq = maxseq
+        self.realign_max = realign_max
+        self.maxfilt = maxfilt
+        self.min_prefilter_hits = min_prefilter_hits
+        self.all_seqs = all_seqs
+        self.alt = alt
+        self.p = p
+        self.z = z
+
+    def query(self, input_fasta_path: str) -> List[Mapping[str, Any]]:
+        """Queries the database using HHblits."""
+        with utils.tmpdir_manager() as query_tmp_dir:
+            a3m_path = os.path.join(query_tmp_dir, "output.a3m")
+
+            db_cmd = []
+            for db_path in self.databases:
+                db_cmd.append("-d")
+                db_cmd.append(db_path)
+            cmd = [
+                self.binary_path,
+                "-i",
+                input_fasta_path,
+                "-cpu",
+                str(self.n_cpu),
+                "-oa3m",
+                a3m_path,
+                "-o",
+                "/dev/null",
+                "-n",
+                str(self.n_iter),
+                "-e",
+                str(self.e_value),
+                "-maxseq",
+                str(self.maxseq),
+                "-realign_max",
+                str(self.realign_max),
+                "-maxfilt",
+                str(self.maxfilt),
+                "-min_prefilter_hits",
+                str(self.min_prefilter_hits),
+            ]
+            if self.all_seqs:
+                cmd += ["-all"]
+            if self.alt:
+                cmd += ["-alt", str(self.alt)]
+            if self.p != _HHBLITS_DEFAULT_P:
+                cmd += ["-p", str(self.p)]
+            if self.z != _HHBLITS_DEFAULT_Z:
+                cmd += ["-Z", str(self.z)]
+            cmd += db_cmd
+
+            logging.info('Launching subprocess "%s"', " ".join(cmd))
+            process = subprocess.Popen(
+                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+
+            with utils.timing("HHblits query"):
+                stdout, stderr = process.communicate()
+                retcode = process.wait()
+
+            if retcode:
+                # Logs have a 15k character limit, so log HHblits error line by line.
+                logging.error("HHblits failed. HHblits stderr begin:")
+                for error_line in stderr.decode("utf-8").splitlines():
+                    if error_line.strip():
+                        logging.error(error_line.strip())
+                logging.error("HHblits stderr end")
+                raise RuntimeError(
+                    "HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n"
+                    % (stdout.decode("utf-8"), stderr[:500_000].decode("utf-8"))
+                )
+
+            with open(a3m_path) as f:
+                a3m = f.read()
+
+        raw_output = dict(
+            a3m=a3m,
+            output=stdout,
+            stderr=stderr,
+            n_iter=self.n_iter,
+            e_value=self.e_value,
+        )
+        return [raw_output]
--- a/unifold/msa/tools/hhsearch.py
+++ b/unifold/msa/tools/hhsearch.py
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#            http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Library to run HHsearch from Python."""
+
+import glob
+import os
+import subprocess
+from typing import Sequence
+
+from absl import logging
+
+from unifold.msa import parsers
+from . import utils
+
+
+class HHSearch:
+    """Python wrapper of the HHsearch binary."""
+
+    def __init__(
+        self, *, binary_path: str, databases: Sequence[str], maxseq: int = 1_000_000
+    ):
+        """Initializes the Python HHsearch wrapper.
+
+        Args:
+            binary_path: The path to the HHsearch executable.
+            databases: A sequence of HHsearch database paths. This should be the
+                common prefix for the database files (i.e. up to but not including
+                _hhm.ffindex etc.)
+            maxseq: The maximum number of rows in an input alignment. Note that this
+                parameter is only supported in HHBlits version 3.1 and higher.
+
+        Raises:
+            RuntimeError: If HHsearch binary not found within the path.
+        """
+        self.binary_path = binary_path
+        self.databases = databases
+        self.maxseq = maxseq
+
+        for database_path in self.databases:
+            if not glob.glob(database_path + "_*"):
+                logging.error("Could not find HHsearch database %s", database_path)
+                raise ValueError(f"Could not find HHsearch database {database_path}")
+
+    @property
+    def output_format(self) -> str:
+        return "hhr"
+
+    @property
+    def input_format(self) -> str:
+        return "a3m"
+
+    def query(self, a3m: str) -> str:
+        """Queries the database using HHsearch using a given a3m."""
+        with utils.tmpdir_manager() as query_tmp_dir:
+            input_path = os.path.join(query_tmp_dir, "query.a3m")
+            hhr_path = os.path.join(query_tmp_dir, "output.hhr")
+            with open(input_path, "w") as f:
+                f.write(a3m)
+
+            db_cmd = []
+            for db_path in self.databases:
+                db_cmd.append("-d")
+                db_cmd.append(db_path)
+            cmd = [
+                self.binary_path,
+                "-i",
+                input_path,
+                "-o",
+                hhr_path,
+                "-maxseq",
+                str(self.maxseq),
+            ] + db_cmd
+
+            logging.info('Launching subprocess "%s"', " ".join(cmd))
+            process = subprocess.Popen(
+                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+            with utils.timing("HHsearch query"):
+                stdout, stderr = process.communicate()
+                retcode = process.wait()
+
+            if retcode:
+                # Stderr is truncated to prevent proto size errors in Beam.
+                raise RuntimeError(
+                    "HHSearch failed:\nstdout:\n%s\n\nstderr:\n%s\n"
+                    % (stdout.decode("utf-8"), stderr[:100_000].decode("utf-8"))
+                )
+
+            with open(hhr_path) as f:
+                hhr = f.read()
+        return hhr
+
+    def get_template_hits(
+        self, output_string: str, input_sequence: str
+    ) -> Sequence[parsers.TemplateHit]:
+        """Gets parsed template hits from the raw string output by the tool."""
+        del input_sequence  # Used by hmmseach but not needed for hhsearch.
+        return parsers.parse_hhr(output_string)
--- a/unifold/msa/tools/hmmbuild.py
+++ b/unifold/msa/tools/hmmbuild.py
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#            http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A Python wrapper for hmmbuild - construct HMM profiles from MSA."""
+
+import os
+import re
+import subprocess
+
+from absl import logging
+from . import utils
+
+
+class Hmmbuild(object):
+    """Python wrapper of the hmmbuild binary."""
+
+    def __init__(self, *, binary_path: str, singlemx: bool = False):
+        """Initializes the Python hmmbuild wrapper.
+
+        Args:
+            binary_path: The path to the hmmbuild executable.
+            singlemx: Whether to use --singlemx flag. If True, it forces HMMBuild to
+                just use a common substitution score matrix.
+
+        Raises:
+            RuntimeError: If hmmbuild binary not found within the path.
+        """
+        self.binary_path = binary_path
+        self.singlemx = singlemx
+
+    def build_profile_from_sto(self, sto: str, model_construction="fast") -> str:
+        """Builds a HHM for the aligned sequences given as an A3M string.
+
+        Args:
+            sto: A string with the aligned sequences in the Stockholm format.
+            model_construction: Whether to use reference annotation in the msa to
+                determine consensus columns ('hand') or default ('fast').
+
+        Returns:
+            A string with the profile in the HMM format.
+
+        Raises:
+            RuntimeError: If hmmbuild fails.
+        """
+        return self._build_profile(sto, model_construction=model_construction)
+
+    def build_profile_from_a3m(self, a3m: str) -> str:
+        """Builds a HHM for the aligned sequences given as an A3M string.
+
+        Args:
+            a3m: A string with the aligned sequences in the A3M format.
+
+        Returns:
+            A string with the profile in the HMM format.
+
+        Raises:
+            RuntimeError: If hmmbuild fails.
+        """
+        lines = []
+        for line in a3m.splitlines():
+            if not line.startswith(">"):
+                line = re.sub("[a-z]+", "", line)  # Remove inserted residues.
+            lines.append(line + "\n")
+        msa = "".join(lines)
+        return self._build_profile(msa, model_construction="fast")
+
+    def _build_profile(self, msa: str, model_construction: str = "fast") -> str:
+        """Builds a HMM for the aligned sequences given as an MSA string.
+
+        Args:
+            msa: A string with the aligned sequences, in A3M or STO format.
+            model_construction: Whether to use reference annotation in the msa to
+                determine consensus columns ('hand') or default ('fast').
+
+        Returns:
+            A string with the profile in the HMM format.
+
+        Raises:
+            RuntimeError: If hmmbuild fails.
+            ValueError: If unspecified arguments are provided.
+        """
+        if model_construction not in {"hand", "fast"}:
+            raise ValueError(
+                f"Invalid model_construction {model_construction} - only"
+                "hand and fast supported."
+            )
+
+        with utils.tmpdir_manager() as query_tmp_dir:
+            input_query = os.path.join(query_tmp_dir, "query.msa")
+            output_hmm_path = os.path.join(query_tmp_dir, "output.hmm")
+
+            with open(input_query, "w") as f:
+                f.write(msa)
+
+            cmd = [self.binary_path]
+            # If adding flags, we have to do so before the output and input:
+
+            if model_construction == "hand":
+                cmd.append(f"--{model_construction}")
+            if self.singlemx:
+                cmd.append("--singlemx")
+            cmd.extend(
+                [
+                    "--amino",
+                    output_hmm_path,
+                    input_query,
+                ]
+            )
+
+            logging.info("Launching subprocess %s", cmd)
+            process = subprocess.Popen(
+                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+
+            with utils.timing("hmmbuild query"):
+                stdout, stderr = process.communicate()
+                retcode = process.wait()
+                logging.info(
+                    "hmmbuild stdout:\n%s\n\nstderr:\n%s\n",
+                    stdout.decode("utf-8"),
+                    stderr.decode("utf-8"),
+                )
+
+            if retcode:
+                raise RuntimeError(
+                    "hmmbuild failed\nstdout:\n%s\n\nstderr:\n%s\n"
+                    % (stdout.decode("utf-8"), stderr.decode("utf-8"))
+                )
+
+            with open(output_hmm_path, encoding="utf-8") as f:
+                hmm = f.read()
+
+        return hmm
--- a/unifold/msa/tools/hmmsearch.py
+++ b/unifold/msa/tools/hmmsearch.py
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#            http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""A Python wrapper for hmmsearch - search profile against a sequence db."""
+
+import os
+import subprocess
+from typing import Optional, Sequence
+
+from absl import logging
+from unifold.msa import parsers
+from . import hmmbuild, utils
+
+
+class Hmmsearch(object):
+    """Python wrapper of the hmmsearch binary."""
+
+    def __init__(
+        self,
+        *,
+        binary_path: str,
+        hmmbuild_binary_path: str,
+        database_path: str,
+        flags: Optional[Sequence[str]] = None,
+    ):
+        """Initializes the Python hmmsearch wrapper.
+
+        Args:
+            binary_path: The path to the hmmsearch executable.
+            hmmbuild_binary_path: The path to the hmmbuild executable. Used to build
+                an hmm from an input a3m.
+            database_path: The path to the hmmsearch database (FASTA format).
+            flags: List of flags to be used by hmmsearch.
+
+        Raises:
+            RuntimeError: If hmmsearch binary not found within the path.
+        """
+        self.binary_path = binary_path
+        self.hmmbuild_runner = hmmbuild.Hmmbuild(binary_path=hmmbuild_binary_path)
+        self.database_path = database_path
+        if flags is None:
+            # Default hmmsearch run settings.
+            flags = [
+                "--F1",
+                "0.1",
+                "--F2",
+                "0.1",
+                "--F3",
+                "0.1",
+                "--incE",
+                "100",
+                "-E",
+                "100",
+                "--domE",
+                "100",
+                "--incdomE",
+                "100",
+            ]
+        self.flags = flags
+
+        if not os.path.exists(self.database_path):
+            logging.error("Could not find hmmsearch database %s", database_path)
+            raise ValueError(f"Could not find hmmsearch database {database_path}")
+
+    @property
+    def output_format(self) -> str:
+        return "sto"
+
+    @property
+    def input_format(self) -> str:
+        return "sto"
+
+    def query(self, msa_sto: str) -> str:
+        """Queries the database using hmmsearch using a given stockholm msa."""
+        hmm = self.hmmbuild_runner.build_profile_from_sto(
+            msa_sto, model_construction="hand"
+        )
+        return self.query_with_hmm(hmm)
+
+    def query_with_hmm(self, hmm: str) -> str:
+        """Queries the database using hmmsearch using a given hmm."""
+        with utils.tmpdir_manager() as query_tmp_dir:
+            hmm_input_path = os.path.join(query_tmp_dir, "query.hmm")
+            out_path = os.path.join(query_tmp_dir, "output.sto")
+            with open(hmm_input_path, "w") as f:
+                f.write(hmm)
+
+            cmd = [
+                self.binary_path,
+                "--noali",  # Don't include the alignment in stdout.
+                "--cpu",
+                "8",
+            ]
+            # If adding flags, we have to do so before the output and input:
+            if self.flags:
+                cmd.extend(self.flags)
+            cmd.extend(
+                [
+                    "-A",
+                    out_path,
+                    hmm_input_path,
+                    self.database_path,
+                ]
+            )
+
+            logging.info("Launching sub-process %s", cmd)
+            process = subprocess.Popen(
+                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+            with utils.timing(
+                f"hmmsearch ({os.path.basename(self.database_path)}) query"
+            ):
+                stdout, stderr = process.communicate()
+                retcode = process.wait()
+
+            if retcode:
+                raise RuntimeError(
+                    "hmmsearch failed:\nstdout:\n%s\n\nstderr:\n%s\n"
+                    % (stdout.decode("utf-8"), stderr.decode("utf-8"))
+                )
+
+            with open(out_path) as f:
+                out_msa = f.read()
+
+        return out_msa
+
+    def get_template_hits(
+        self, output_string: str, input_sequence: str
+    ) -> Sequence[parsers.TemplateHit]:
+        """Gets parsed template hits from the raw string output by the tool."""
+        a3m_string = parsers.convert_stockholm_to_a3m(
+            output_string, remove_first_row_gaps=False
+        )
+        template_hits = parsers.parse_hmmsearch_a3m(
+            query_sequence=input_sequence, a3m_string=a3m_string, skip_first=False
+        )
+        return template_hits
--- a/unifold/msa/tools/jackhmmer.py
+++ b/unifold/msa/tools/jackhmmer.py
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#            http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Library to run Jackhmmer from Python."""
+
+from concurrent import futures
+import glob
+import os
+import subprocess
+from typing import Any, Callable, Mapping, Optional, Sequence
+from urllib import request
+
+from absl import logging
+
+from . import utils
+
+
+class Jackhmmer:
+    """Python wrapper of the Jackhmmer binary.
+
+    The wrapper runs in one of two modes, selected at construction time:
+
+    * Single database: `num_streamed_chunks` is None and `database_path` is
+      passed to Jackhmmer as-is.
+    * Streamed database: `num_streamed_chunks` is set and the database is
+      read as the numbered chunks `<database_path>.1` ...
+      `<database_path>.<num_streamed_chunks>`. Each chunk is copied into
+      `/tmp/ramdisk` and searched while the next one is being fetched.
+    """
+
+    def __init__(
+        self,
+        *,
+        binary_path: str,
+        database_path: str,
+        n_cpu: int = 8,
+        n_iter: int = 1,
+        e_value: float = 0.0001,
+        z_value: Optional[int] = None,
+        get_tblout: bool = False,
+        filter_f1: float = 0.0005,
+        filter_f2: float = 0.00005,
+        filter_f3: float = 0.0000005,
+        incdom_e: Optional[float] = None,
+        dom_e: Optional[float] = None,
+        num_streamed_chunks: Optional[int] = None,
+        streaming_callback: Optional[Callable[[int], None]] = None,
+    ):
+        """Initializes the Python Jackhmmer wrapper.
+
+        Args:
+            binary_path: The path to the jackhmmer executable.
+            database_path: The path to the jackhmmer database (FASTA format).
+            n_cpu: The number of CPUs to give Jackhmmer.
+            n_iter: The number of Jackhmmer iterations.
+            e_value: The E-value, see Jackhmmer docs for more details.
+            z_value: The Z-value, see Jackhmmer docs for more details.
+            get_tblout: Whether to save tblout string.
+            filter_f1: MSV and biased composition pre-filter, set to >1.0 to turn off.
+            filter_f2: Viterbi pre-filter, set to >1.0 to turn off.
+            filter_f3: Forward pre-filter, set to >1.0 to turn off.
+            incdom_e: Domain e-value criteria for inclusion of domains in MSA/next
+                round.
+            dom_e: Domain e-value criteria for inclusion in tblout.
+            num_streamed_chunks: Number of database chunks to stream over.
+            streaming_callback: Callback function run after each chunk iteration with
+                the iteration number as argument.
+
+        Raises:
+            ValueError: If `database_path` does not exist and streaming is
+                disabled (`num_streamed_chunks` is None).
+        """
+        self.binary_path = binary_path
+        self.database_path = database_path
+        self.num_streamed_chunks = num_streamed_chunks
+
+        # In streaming mode `database_path` is only a prefix for the chunk
+        # locations, so its existence is checked only for the single-file case.
+        if not os.path.exists(self.database_path) and num_streamed_chunks is None:
+            logging.error("Could not find Jackhmmer database %s", database_path)
+            raise ValueError(f"Could not find Jackhmmer database {database_path}")
+
+        self.n_cpu = n_cpu
+        self.n_iter = n_iter
+        self.e_value = e_value
+        self.z_value = z_value
+        self.filter_f1 = filter_f1
+        self.filter_f2 = filter_f2
+        self.filter_f3 = filter_f3
+        self.incdom_e = incdom_e
+        self.dom_e = dom_e
+        self.get_tblout = get_tblout
+        self.streaming_callback = streaming_callback
+
+    def _query_chunk(
+        self, input_fasta_path: str, database_path: str
+    ) -> Mapping[str, Any]:
+        """Queries the database chunk using Jackhmmer.
+
+        Args:
+            input_fasta_path: Path of the query FASTA file handed to Jackhmmer.
+            database_path: Path of the database (or database chunk) to search.
+
+        Returns:
+            A dict with the keys `sto` (contents of the alignment file written
+            via `-A`), `tbl` (contents of the `--tblout` file, or "" when
+            `get_tblout` is False), `stderr` (raw stderr bytes of the process),
+            `n_iter` and `e_value` (the settings used for this run).
+
+        Raises:
+            RuntimeError: If the Jackhmmer process exits with a non-zero code.
+        """
+        with utils.tmpdir_manager() as query_tmp_dir:
+            sto_path = os.path.join(query_tmp_dir, "output.sto")
+
+            # The F1/F2/F3 are the expected proportion to pass each of the filtering
+            # stages (which get progressively more expensive), reducing these
+            # speeds up the pipeline at the expense of sensitivity. They are
+            # currently set very low to make querying Mgnify run in a reasonable
+            # amount of time.
+            cmd_flags = [
+                # Don't pollute stdout with Jackhmmer output.
+                "-o",
+                "/dev/null",
+                "-A",
+                sto_path,
+                "--noali",
+                "--F1",
+                str(self.filter_f1),
+                "--F2",
+                str(self.filter_f2),
+                "--F3",
+                str(self.filter_f3),
+                "--incE",
+                str(self.e_value),
+                # Report only sequences with E-values <= x in per-sequence output.
+                "-E",
+                str(self.e_value),
+                "--cpu",
+                str(self.n_cpu),
+                "-N",
+                str(self.n_iter),
+            ]
+            if self.get_tblout:
+                tblout_path = os.path.join(query_tmp_dir, "tblout.txt")
+                cmd_flags.extend(["--tblout", tblout_path])
+
+            # -Z is omitted when z_value is None or 0.
+            if self.z_value:
+                cmd_flags.extend(["-Z", str(self.z_value)])
+
+            if self.dom_e is not None:
+                cmd_flags.extend(["--domE", str(self.dom_e)])
+
+            if self.incdom_e is not None:
+                cmd_flags.extend(["--incdomE", str(self.incdom_e)])
+
+            cmd = [self.binary_path] + cmd_flags + [input_fasta_path, database_path]
+
+            logging.info('Launching subprocess "%s"', " ".join(cmd))
+            process = subprocess.Popen(
+                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+            with utils.timing(f"Jackhmmer ({os.path.basename(database_path)}) query"):
+                # communicate() waits for the process to exit, so the return
+                # code is available on the Popen object right afterwards.
+                _, stderr = process.communicate()
+                retcode = process.returncode
+
+            if retcode:
+                # Decode leniently: undecodable bytes in stderr must not hide
+                # the actual Jackhmmer failure behind a UnicodeDecodeError.
+                raise RuntimeError(
+                    "Jackhmmer failed\nstderr:\n%s\n"
+                    % stderr.decode("utf-8", errors="replace")
+                )
+
+            # Get e-values for each target name
+            tbl = ""
+            if self.get_tblout:
+                with open(tblout_path) as f:
+                    tbl = f.read()
+
+            # Read the alignment before the temporary directory is cleaned up.
+            with open(sto_path) as f:
+                sto = f.read()
+
+        raw_output = dict(
+            sto=sto, tbl=tbl, stderr=stderr, n_iter=self.n_iter, e_value=self.e_value
+        )
+
+        return raw_output
+
+    def query(self, input_fasta_path: str) -> Sequence[Mapping[str, Any]]:
+        """Queries the database using Jackhmmer.
+
+        Args:
+            input_fasta_path: Path of the query FASTA file.
+
+        Returns:
+            A list with one `_query_chunk` result per searched database: a
+            single element when streaming is disabled, otherwise one element
+            per chunk in chunk order.
+
+        Raises:
+            RuntimeError: If Jackhmmer fails on the database or on any chunk.
+        """
+        if self.num_streamed_chunks is None:
+            return [self._query_chunk(input_fasta_path, self.database_path)]
+
+        n_chunks = self.num_streamed_chunks
+        db_basename = os.path.basename(self.database_path)
+
+        def db_remote_chunk(db_idx):
+            return f"{self.database_path}.{db_idx}"
+
+        def db_local_chunk(db_idx):
+            return f"/tmp/ramdisk/{db_basename}.{db_idx}"
+
+        def remove_local_chunks():
+            # Best-effort sweep of every local chunk copy of this database.
+            for chunk_path in glob.glob(db_local_chunk("[0-9]*")):
+                try:
+                    os.remove(chunk_path)
+                except OSError:
+                    logging.warning("OSError while deleting %s", chunk_path)
+
+        # Remove existing files to prevent OOM
+        remove_local_chunks()
+
+        chunked_output = []
+        try:
+            # Download the (i+1)-th chunk while Jackhmmer is running on the i-th chunk
+            with futures.ThreadPoolExecutor(max_workers=2) as executor:
+                for i in range(1, n_chunks + 1):
+                    # Copy the chunk locally
+                    if i == 1:
+                        future = executor.submit(
+                            request.urlretrieve, db_remote_chunk(i), db_local_chunk(i)
+                        )
+                    if i < n_chunks:
+                        next_future = executor.submit(
+                            request.urlretrieve,
+                            db_remote_chunk(i + 1),
+                            db_local_chunk(i + 1),
+                        )
+
+                    # Run Jackhmmer with the chunk
+                    future.result()
+                    chunked_output.append(
+                        self._query_chunk(input_fasta_path, db_local_chunk(i))
+                    )
+
+                    # Remove the local copy of the chunk
+                    os.remove(db_local_chunk(i))
+                    # Do not set next_future for the last chunk so that this works
+                    # even for databases with only 1 chunk.
+                    if i < n_chunks:
+                        future = next_future
+                    if self.streaming_callback:
+                        self.streaming_callback(i)
+        finally:
+            # On success every chunk has already been removed and this is a
+            # no-op. On failure the executor has finished any in-flight
+            # download by now, so the failed chunk and the prefetched one are
+            # removed here instead of being left behind on the ramdisk.
+            remove_local_chunks()
+        return chunked_output