"docs/git@developer.sourcefind.cn:OpenDAS/dynamo.git" did not exist on "73474c6a7313311c9a3f18be1c0a5814d4bf0215"
Commit e699d7d2 authored by Gustaf Ahdritz

Start implementing Multimer

parent 61d004a2
@@ -16,6 +16,7 @@
"""Parses the mmCIF file format."""
import collections
import dataclasses
import functools
import io
import json
import logging
@@ -173,6 +174,7 @@ def mmcif_loop_to_dict(
return {entry[index]: entry for entry in entries}
@functools.lru_cache(16, typed=False)
def parse(
*, file_id: str, mmcif_string: str, catch_all_errors: bool = True
) -> ParsingResult:
@@ -346,7 +348,7 @@ def _get_header(parsed_info: MmCIFDict) -> PdbHeader:
raw_resolution = parsed_info[res_key][0]
header["resolution"] = float(raw_resolution)
except ValueError:
logging.debug(
"Invalid resolution format: %s", parsed_info[res_key]
)
...
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for extracting identifiers from MSA sequence descriptions."""
import dataclasses
import re
from typing import Optional
# Sequences coming from the UniProtKB database come in the
# `db|UniqueIdentifier|EntryName` format, e.g. `tr|A0A146SKV9|A0A146SKV9_FUNHE`
# (TrEMBL) or `sp|P0C2L1|A3X1_LOXLA` (Swiss-Prot).
_UNIPROT_PATTERN = re.compile(
r"""
^
# UniProtKB/TrEMBL or UniProtKB/Swiss-Prot
(?:tr|sp)
\|
# A primary accession number of the UniProtKB entry.
(?P<AccessionIdentifier>[A-Za-z0-9]{6,10})
# Occasionally there is a _0 or _1 isoform suffix, which we ignore.
(?:_\d)?
\|
# TREMBL repeats the accession ID here. Swiss-Prot has a mnemonic
# protein ID code.
(?:[A-Za-z0-9]+)
_
# A mnemonic species identification code.
(?P<SpeciesIdentifier>([A-Za-z0-9]){1,5})
# Small BFD uses a final value after an underscore, which we ignore.
(?:_\d+)?
$
""",
re.VERBOSE)
@dataclasses.dataclass(frozen=True)
class Identifiers:
uniprot_accession_id: str = ''
species_id: str = ''
def _parse_sequence_identifier(msa_sequence_identifier: str) -> Identifiers:
"""Gets accession id and species from an msa sequence identifier.
The sequence identifier has the format specified by _UNIPROT_PATTERN.
An example of a sequence identifier: `tr|A0A146SKV9|A0A146SKV9_FUNHE`
Args:
msa_sequence_identifier: a sequence identifier.
Returns:
An `Identifiers` instance with a uniprot_accession_id and species_id. These
can be empty in the case where no identifier was found.
"""
matches = re.search(_UNIPROT_PATTERN, msa_sequence_identifier.strip())
if matches:
return Identifiers(
uniprot_accession_id=matches.group('AccessionIdentifier'),
species_id=matches.group('SpeciesIdentifier'))
return Identifiers()
def _extract_sequence_identifier(description: str) -> Optional[str]:
"""Extracts sequence identifier from description. Returns None if no match."""
split_description = description.split()
if split_description:
return split_description[0].partition('/')[0]
else:
return None
def get_identifiers(description: str) -> Identifiers:
"""Computes extra MSA features from the description."""
sequence_identifier = _extract_sequence_identifier(description)
if sequence_identifier is None:
return Identifiers()
else:
return _parse_sequence_identifier(sequence_identifier)
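# Illustrative usage (not part of the commit; the description strings are
# made-up examples in the UniProt format documented above):
#
#   >>> get_identifiers('tr|A0A146SKV9|A0A146SKV9_FUNHE/11-74')
#   Identifiers(uniprot_accession_id='A0A146SKV9', species_id='FUNHE')
#   >>> get_identifiers('UniRef90_A0A146SKV9')  # no UniProt pattern -> empty
#   Identifiers(uniprot_accession_id='', species_id='')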
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pairing logic for multimer data pipeline."""
import collections
import functools
import string
from typing import Any, Dict, Iterable, List, Mapping, Sequence
import numpy as np
import pandas as pd
import scipy.linalg
from openfold.np import residue_constants
# TODO: This stuff should probably also be in a config
ALPHA_ACCESSION_ID_MAP = {x: y for y, x in enumerate(string.ascii_uppercase)}
ALPHANUM_ACCESSION_ID_MAP = {
chr: num for num, chr in enumerate(string.ascii_uppercase + string.digits)
} # A-Z,0-9
NUM_ACCESSION_ID_MAP = {str(x): x for x in range(10)} # 0-9
MSA_GAP_IDX = residue_constants.restypes_with_x_and_gap.index('-')
SEQUENCE_GAP_CUTOFF = 0.5
SEQUENCE_SIMILARITY_CUTOFF = 0.9
MSA_PAD_VALUES = {'msa_all_seq': MSA_GAP_IDX,
'msa_mask_all_seq': 1,
'deletion_matrix_all_seq': 0,
'deletion_matrix_int_all_seq': 0,
'msa': MSA_GAP_IDX,
'msa_mask': 1,
'deletion_matrix': 0,
'deletion_matrix_int': 0}
MSA_FEATURES = ('msa', 'msa_mask', 'deletion_matrix', 'deletion_matrix_int')
SEQ_FEATURES = ('residue_index', 'aatype', 'all_atom_positions',
'all_atom_mask', 'seq_mask', 'between_segment_residues',
'has_alt_locations', 'has_hetatoms', 'asym_id', 'entity_id',
'sym_id', 'entity_mask', 'deletion_mean',
'prediction_atom_mask',
'literature_positions', 'atom_indices_to_group_indices',
'rigid_group_default_frame')
TEMPLATE_FEATURES = ('template_aatype', 'template_all_atom_positions',
'template_all_atom_mask')
CHAIN_FEATURES = ('num_alignments', 'seq_length')
def create_paired_features(
chains: Iterable[Mapping[str, np.ndarray]],
prokaryotic: bool,
) -> List[Mapping[str, np.ndarray]]:
"""Returns the original chains with paired NUM_SEQ features.
Args:
chains: A list of feature dictionaries for each chain.
prokaryotic: Whether the target complex is from a prokaryotic organism.
Used to determine the distance metric for pairing.
Returns:
A list of feature dictionaries with sequence features including only
rows to be paired.
"""
chains = list(chains)
chain_keys = chains[0].keys()
if len(chains) < 2:
return chains
else:
updated_chains = []
paired_chains_to_paired_row_indices = pair_sequences(
chains, prokaryotic)
paired_rows = reorder_paired_rows(
paired_chains_to_paired_row_indices)
for chain_num, chain in enumerate(chains):
new_chain = {k: v for k, v in chain.items() if '_all_seq' not in k}
for feature_name in chain_keys:
if feature_name.endswith('_all_seq'):
feats_padded = pad_features(chain[feature_name], feature_name)
new_chain[feature_name] = feats_padded[paired_rows[:, chain_num]]
new_chain['num_alignments_all_seq'] = np.asarray(
len(paired_rows[:, chain_num]))
updated_chains.append(new_chain)
return updated_chains
def pad_features(feature: np.ndarray, feature_name: str) -> np.ndarray:
"""Add a 'padding' row at the end of the features list.
The padding row will be selected as a 'paired' row in the case of partial
alignment - for the chain that doesn't have paired alignment.
Args:
feature: The feature to be padded.
feature_name: The name of the feature to be padded.
Returns:
The feature with an additional padding row.
"""
assert feature.dtype != np.dtype(np.string_)
if feature_name in ('msa_all_seq', 'msa_mask_all_seq',
'deletion_matrix_all_seq', 'deletion_matrix_int_all_seq'):
num_res = feature.shape[1]
padding = MSA_PAD_VALUES[feature_name] * np.ones([1, num_res],
feature.dtype)
elif feature_name in ('msa_uniprot_accession_identifiers_all_seq',
'msa_species_identifiers_all_seq'):
padding = [b'']
else:
return feature
feats_padded = np.concatenate([feature, padding], axis=0)
return feats_padded
def _make_msa_df(chain_features: Mapping[str, np.ndarray]) -> pd.DataFrame:
"""Makes dataframe with msa features needed for msa pairing."""
chain_msa = chain_features['msa_all_seq']
query_seq = chain_msa[0]
per_seq_similarity = np.sum(
query_seq[None] == chain_msa, axis=-1) / float(len(query_seq))
per_seq_gap = np.sum(chain_msa == 21, axis=-1) / float(len(query_seq))
msa_df = pd.DataFrame({
'msa_species_identifiers':
chain_features['msa_species_identifiers_all_seq'],
'msa_uniprot_accession_identifiers':
chain_features['msa_uniprot_accession_identifiers_all_seq'],
'msa_row':
np.arange(len(
chain_features['msa_uniprot_accession_identifiers_all_seq'])),
'msa_similarity': per_seq_similarity,
'gap': per_seq_gap
})
return msa_df
def _create_species_dict(msa_df: pd.DataFrame) -> Dict[bytes, pd.DataFrame]:
"""Creates mapping from species to msa dataframe of that species."""
species_lookup = {}
for species, species_df in msa_df.groupby('msa_species_identifiers'):
species_lookup[species] = species_df
return species_lookup
@functools.lru_cache(maxsize=65536)
def encode_accession(accession_id: str) -> int:
"""Map accession codes to the serial order in which they were assigned."""
alpha = ALPHA_ACCESSION_ID_MAP # A-Z
alphanum = ALPHANUM_ACCESSION_ID_MAP # A-Z,0-9
num = NUM_ACCESSION_ID_MAP # 0-9
coding = 0
# This is based on the uniprot accession id format
# https://www.uniprot.org/help/accession_numbers
if accession_id[0] in {'O', 'P', 'Q'}:
bases = (alpha, num, alphanum, alphanum, alphanum, num)
elif len(accession_id) == 6:
bases = (alpha, num, alpha, alphanum, alphanum, num)
elif len(accession_id) == 10:
bases = (alpha, num, alpha, alphanum, alphanum, num, alpha, alphanum,
alphanum, num)
product = 1
for place, base in zip(reversed(accession_id), reversed(bases)):
coding += base[place] * product
product *= len(base)
return coding
def _calc_id_diff(id_a: bytes, id_b: bytes) -> int:
return abs(encode_accession(id_a.decode()) - encode_accession(id_b.decode()))
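# Illustrative example (not part of the commit; standard 6-character
# accessions assumed): because encode_accession maps accessions to their
# serial assignment order, consecutive accessions encode to adjacent
# integers, and _calc_id_diff uses the difference as a genetic-distance proxy.
#
#   >>> _calc_id_diff(b'Q60392', b'Q60393')
#   1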
def _find_all_accession_matches(accession_id_lists: List[List[bytes]],
diff_cutoff: int = 20
) -> List[List[Any]]:
"""Finds accession id matches across the chains based on their difference."""
all_accession_tuples = []
current_tuple = []
tokens_used_in_answer = set()
def _matches_all_in_current_tuple(inp: bytes, diff_cutoff: int) -> bool:
return all((_calc_id_diff(s, inp) < diff_cutoff for s in current_tuple))
def _all_tokens_not_used_before() -> bool:
return all((s not in tokens_used_in_answer for s in current_tuple))
def dfs(level, accession_id, diff_cutoff=diff_cutoff) -> None:
if level == len(accession_id_lists) - 1:
if _all_tokens_not_used_before():
all_accession_tuples.append(list(current_tuple))
for s in current_tuple:
tokens_used_in_answer.add(s)
return
if level == -1:
new_list = accession_id_lists[level+1]
else:
new_list = [(_calc_id_diff(accession_id, s), s) for
s in accession_id_lists[level+1]]
new_list = sorted(new_list)
new_list = [s for d, s in new_list]
for s in new_list:
if (_matches_all_in_current_tuple(s, diff_cutoff) and
s not in tokens_used_in_answer):
current_tuple.append(s)
dfs(level + 1, s)
current_tuple.pop()
dfs(-1, '')
return all_accession_tuples
def _accession_row(msa_df: pd.DataFrame, accession_id: bytes) -> pd.Series:
matched_df = msa_df[msa_df.msa_uniprot_accession_identifiers == accession_id]
return matched_df.iloc[0]
def _match_rows_by_genetic_distance(
this_species_msa_dfs: List[pd.DataFrame],
cutoff: int = 20) -> List[List[int]]:
"""Finds MSA sequence pairings across chains within a genetic distance cutoff.
The genetic distance between two sequences is approximated by taking the
difference in their UniProt accession ids.
Args:
this_species_msa_dfs: a list of dataframes containing MSA features for
sequences for a specific species. If species is missing for a chain, the
dataframe is set to None.
cutoff: the genetic distance cutoff.
Returns:
A list of lists, each containing M indices corresponding to paired MSA rows,
where M is the number of chains.
"""
num_examples = len(this_species_msa_dfs) # N
accession_id_lists = [] # M
match_index_to_chain_index = {}
for chain_index, species_df in enumerate(this_species_msa_dfs):
if species_df is not None:
accession_id_lists.append(
list(species_df.msa_uniprot_accession_identifiers.values))
# Keep track of which of the this_species_msa_dfs are not None.
match_index_to_chain_index[len(accession_id_lists) - 1] = chain_index
all_accession_id_matches = _find_all_accession_matches(
accession_id_lists, cutoff) # [k, M]
all_paired_msa_rows = [] # [k, N]
for accession_id_match in all_accession_id_matches:
paired_msa_rows = []
for match_index, accession_id in enumerate(accession_id_match):
# Map back to chain index.
chain_index = match_index_to_chain_index[match_index]
seq_series = _accession_row(
this_species_msa_dfs[chain_index], accession_id)
if (seq_series.msa_similarity > SEQUENCE_SIMILARITY_CUTOFF or
seq_series.gap > SEQUENCE_GAP_CUTOFF):
continue
else:
paired_msa_rows.append(seq_series.msa_row)
# If a sequence is skipped based on sequence similarity to the respective
# target sequence or the gap cutoff, the lengths of accession_id_match and
# paired_msa_rows will differ. Skip this match.
if len(paired_msa_rows) == len(accession_id_match):
paired_and_non_paired_msa_rows = np.array([-1] * num_examples)
matched_chain_indices = list(match_index_to_chain_index.values())
paired_and_non_paired_msa_rows[matched_chain_indices] = paired_msa_rows
all_paired_msa_rows.append(list(paired_and_non_paired_msa_rows))
return all_paired_msa_rows
def _match_rows_by_sequence_similarity(this_species_msa_dfs: List[pd.DataFrame]
) -> List[List[int]]:
"""Finds MSA sequence pairings across chains based on sequence similarity.
Each chain's MSA sequences are first sorted by their sequence similarity to
their respective target sequence. The sequences are then paired, starting
from the sequences most similar to their target sequence.
Args:
this_species_msa_dfs: a list of dataframes containing MSA features for
sequences for a specific species.
Returns:
A list of lists, each containing M indices corresponding to paired MSA rows,
where M is the number of chains.
"""
all_paired_msa_rows = []
num_seqs = [len(species_df) for species_df in this_species_msa_dfs
if species_df is not None]
take_num_seqs = np.min(num_seqs)
sort_by_similarity = (
lambda x: x.sort_values('msa_similarity', axis=0, ascending=False))
for species_df in this_species_msa_dfs:
if species_df is not None:
species_df_sorted = sort_by_similarity(species_df)
msa_rows = species_df_sorted.msa_row.iloc[:take_num_seqs].values
else:
msa_rows = [-1] * take_num_seqs # take the last 'padding' row
all_paired_msa_rows.append(msa_rows)
all_paired_msa_rows = list(np.array(all_paired_msa_rows).transpose())
return all_paired_msa_rows
def pair_sequences(examples: List[Mapping[str, np.ndarray]],
prokaryotic: bool) -> Dict[int, np.ndarray]:
"""Returns indices for paired MSA sequences across chains."""
num_examples = len(examples)
all_chain_species_dict = []
common_species = set()
for chain_features in examples:
msa_df = _make_msa_df(chain_features)
species_dict = _create_species_dict(msa_df)
all_chain_species_dict.append(species_dict)
common_species.update(set(species_dict))
common_species = sorted(common_species)
common_species.remove(b'') # Remove target sequence species.
all_paired_msa_rows = [np.zeros(len(examples), int)]
all_paired_msa_rows_dict = {k: [] for k in range(num_examples)}
all_paired_msa_rows_dict[num_examples] = [np.zeros(len(examples), int)]
for species in common_species:
if not species:
continue
this_species_msa_dfs = []
species_dfs_present = 0
for species_dict in all_chain_species_dict:
if species in species_dict:
this_species_msa_dfs.append(species_dict[species])
species_dfs_present += 1
else:
this_species_msa_dfs.append(None)
# Skip species that are present in only one chain.
if species_dfs_present <= 1:
continue
if np.any(
np.array([len(species_df) for species_df in
this_species_msa_dfs if
isinstance(species_df, pd.DataFrame)]) > 600):
continue
# In prokaryotes (and some eukaryotes), interacting genes are often
# co-located on the chromosome into operons. Because of that we can assume
# that if two proteins' intergenic distance is less than a threshold, the
# two proteins will form an interacting pair.
# In most eukaryotes, a single protein's MSA can contain many paralogs, and
# two genes may interact even if they are not close by genomic distance. For
# eukaryotes, some methods therefore pair MSA sequences by sequence
# similarity instead.
# See Jinbo Xu's work:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6030867/#B28.
if prokaryotic:
paired_msa_rows = _match_rows_by_genetic_distance(this_species_msa_dfs)
if not paired_msa_rows:
continue
else:
paired_msa_rows = _match_rows_by_sequence_similarity(this_species_msa_dfs)
all_paired_msa_rows.extend(paired_msa_rows)
all_paired_msa_rows_dict[species_dfs_present].extend(paired_msa_rows)
all_paired_msa_rows_dict = {
num_examples: np.array(paired_msa_rows) for
num_examples, paired_msa_rows in all_paired_msa_rows_dict.items()
}
return all_paired_msa_rows_dict
def reorder_paired_rows(all_paired_msa_rows_dict: Dict[int, np.ndarray]
) -> np.ndarray:
"""Creates a list of indices of paired MSA rows across chains.
Args:
all_paired_msa_rows_dict: a mapping from the number of paired chains to the
paired indices.
Returns:
a list of lists, each containing indices of paired MSA rows across chains.
The paired-index lists are ordered by:
1) the number of chains in the paired alignment, i.e, all-chain pairings
will come first.
2) e-values
"""
all_paired_msa_rows = []
for num_pairings in sorted(all_paired_msa_rows_dict, reverse=True):
paired_rows = all_paired_msa_rows_dict[num_pairings]
paired_rows_product = abs(np.array([np.prod(rows) for rows in paired_rows]))
paired_rows_sort_index = np.argsort(paired_rows_product)
all_paired_msa_rows.extend(paired_rows[paired_rows_sort_index])
return np.array(all_paired_msa_rows)
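# Illustrative example (not part of the commit): all-chain pairings (here the
# 3-chain group) come first, and rows within a group are sorted by the
# absolute product of their row indices.
#
#   >>> d = {
#   ...     2: np.array([[5, 6, -1], [1, 2, -1]]),
#   ...     3: np.array([[1, 1, 1], [4, 4, 4]]),
#   ... }
#   >>> reorder_paired_rows(d)
#   array([[ 1,  1,  1],
#          [ 4,  4,  4],
#          [ 1,  2, -1],
#          [ 5,  6, -1]])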
def block_diag(*arrs: np.ndarray, pad_value: float = 0.0) -> np.ndarray:
"""Like scipy.linalg.block_diag but with an optional padding value."""
ones_arrs = [np.ones_like(x) for x in arrs]
off_diag_mask = 1.0 - scipy.linalg.block_diag(*ones_arrs)
diag = scipy.linalg.block_diag(*arrs)
diag += (off_diag_mask * pad_value).astype(diag.dtype)
return diag
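# Illustrative example (not part of the commit): off-diagonal entries are
# filled with pad_value instead of scipy's default zero.
#
#   >>> block_diag(np.array([[1, 2]]), np.array([[3]]), pad_value=-1)
#   array([[ 1,  2, -1],
#          [-1, -1,  3]])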
def _correct_post_merged_feats(
np_example: Mapping[str, np.ndarray],
np_chains_list: Sequence[Mapping[str, np.ndarray]],
pair_msa_sequences: bool) -> Mapping[str, np.ndarray]:
"""Adds features that need to be computed/recomputed post merging."""
np_example['seq_length'] = np.asarray(np_example['aatype'].shape[0],
dtype=np.int32)
np_example['num_alignments'] = np.asarray(np_example['msa'].shape[0],
dtype=np.int32)
if not pair_msa_sequences:
# Generate a bias that is 1 for the first row of every block in the
# block diagonal MSA - i.e. make sure the cluster stack always includes
# the query sequences for each chain (since the first row is the query
# sequence).
cluster_bias_masks = []
for chain in np_chains_list:
mask = np.zeros(chain['msa'].shape[0])
mask[0] = 1
cluster_bias_masks.append(mask)
np_example['cluster_bias_mask'] = np.concatenate(cluster_bias_masks)
# Initialize Bert mask with masked out off diagonals.
msa_masks = [np.ones(x['msa'].shape, dtype=np.float32)
for x in np_chains_list]
np_example['bert_mask'] = block_diag(
*msa_masks, pad_value=0)
else:
np_example['cluster_bias_mask'] = np.zeros(np_example['msa'].shape[0])
np_example['cluster_bias_mask'][0] = 1
# Initialize Bert mask with masked out off diagonals.
msa_masks = [np.ones(x['msa'].shape, dtype=np.float32) for
x in np_chains_list]
msa_masks_all_seq = [np.ones(x['msa_all_seq'].shape, dtype=np.float32) for
x in np_chains_list]
msa_mask_block_diag = block_diag(
*msa_masks, pad_value=0)
msa_mask_all_seq = np.concatenate(msa_masks_all_seq, axis=1)
np_example['bert_mask'] = np.concatenate(
[msa_mask_all_seq, msa_mask_block_diag], axis=0)
return np_example
def _pad_templates(chains: Sequence[Mapping[str, np.ndarray]],
max_templates: int) -> Sequence[Mapping[str, np.ndarray]]:
"""For each chain pad the number of templates to a fixed size.
Args:
chains: A list of protein chains.
max_templates: Each chain will be padded to have this many templates.
Returns:
The list of chains, updated to have template features padded to
max_templates.
"""
for chain in chains:
for k, v in chain.items():
if k in TEMPLATE_FEATURES:
padding = np.zeros_like(v.shape)
padding[0] = max_templates - v.shape[0]
padding = [(0, p) for p in padding]
chain[k] = np.pad(v, padding, mode='constant')
return chains
def _merge_features_from_multiple_chains(
chains: Sequence[Mapping[str, np.ndarray]],
pair_msa_sequences: bool) -> Mapping[str, np.ndarray]:
"""Merge features from multiple chains.
Args:
chains: A list of feature dictionaries that we want to merge.
pair_msa_sequences: Whether to concatenate MSA features along the
num_res dimension (if True), or to block diagonalize them (if False).
Returns:
A feature dictionary for the merged example.
"""
merged_example = {}
for feature_name in chains[0]:
feats = [x[feature_name] for x in chains]
feature_name_split = feature_name.split('_all_seq')[0]
if feature_name_split in MSA_FEATURES:
if pair_msa_sequences or '_all_seq' in feature_name:
merged_example[feature_name] = np.concatenate(feats, axis=1)
else:
merged_example[feature_name] = block_diag(
*feats, pad_value=MSA_PAD_VALUES[feature_name])
elif feature_name_split in SEQ_FEATURES:
merged_example[feature_name] = np.concatenate(feats, axis=0)
elif feature_name_split in TEMPLATE_FEATURES:
merged_example[feature_name] = np.concatenate(feats, axis=1)
elif feature_name_split in CHAIN_FEATURES:
merged_example[feature_name] = np.sum([x for x in feats]).astype(np.int32)
else:
merged_example[feature_name] = feats[0]
return merged_example
def _merge_homomers_dense_msa(
chains: Iterable[Mapping[str, np.ndarray]]) -> Sequence[Mapping[str, np.ndarray]]:
"""Merge all identical chains, making the resulting MSA dense.
Args:
chains: An iterable of features for each chain.
Returns:
A list of feature dictionaries. All features with the same entity_id
will be merged - MSA features will be concatenated along the num_res
dimension - making them dense.
"""
entity_chains = collections.defaultdict(list)
for chain in chains:
entity_id = chain['entity_id'][0]
entity_chains[entity_id].append(chain)
grouped_chains = []
for entity_id in sorted(entity_chains):
chains = entity_chains[entity_id]
grouped_chains.append(chains)
chains = [
_merge_features_from_multiple_chains(chains, pair_msa_sequences=True)
for chains in grouped_chains]
return chains
def _concatenate_paired_and_unpaired_features(
example: Mapping[str, np.ndarray]) -> Mapping[str, np.ndarray]:
"""Merges paired and block-diagonalised features."""
features = MSA_FEATURES
for feature_name in features:
if feature_name in example:
feat = example[feature_name]
feat_all_seq = example[feature_name + '_all_seq']
merged_feat = np.concatenate([feat_all_seq, feat], axis=0)
example[feature_name] = merged_feat
example['num_alignments'] = np.array(example['msa'].shape[0],
dtype=np.int32)
return example
def merge_chain_features(np_chains_list: List[Mapping[str, np.ndarray]],
pair_msa_sequences: bool,
max_templates: int) -> Mapping[str, np.ndarray]:
"""Merges features for multiple chains to single FeatureDict.
Args:
np_chains_list: List of FeatureDicts for each chain.
pair_msa_sequences: Whether to merge paired MSAs.
max_templates: The maximum number of templates to include.
Returns:
Single FeatureDict for entire complex.
"""
np_chains_list = _pad_templates(
np_chains_list, max_templates=max_templates)
np_chains_list = _merge_homomers_dense_msa(np_chains_list)
# Unpaired MSA features will be always block-diagonalised; paired MSA
# features will be concatenated.
np_example = _merge_features_from_multiple_chains(
np_chains_list, pair_msa_sequences=False)
if pair_msa_sequences:
np_example = _concatenate_paired_and_unpaired_features(np_example)
np_example = _correct_post_merged_feats(
np_example=np_example,
np_chains_list=np_chains_list,
pair_msa_sequences=pair_msa_sequences)
return np_example
def deduplicate_unpaired_sequences(
np_chains: List[Mapping[str, np.ndarray]]) -> List[Mapping[str, np.ndarray]]:
"""Removes unpaired sequences which duplicate a paired sequence."""
feature_names = np_chains[0].keys()
msa_features = MSA_FEATURES
for chain in np_chains:
# Convert the msa_all_seq numpy array to a tuple for hashing.
sequence_set = set(tuple(s) for s in chain['msa_all_seq'])
keep_rows = []
# Go through unpaired MSA seqs and remove any rows that correspond to the
# sequences that are already present in the paired MSA.
for row_num, seq in enumerate(chain['msa']):
if tuple(seq) not in sequence_set:
keep_rows.append(row_num)
for feature_name in feature_names:
if feature_name in msa_features:
chain[feature_name] = chain[feature_name][keep_rows]
chain['num_alignments'] = np.array(chain['msa'].shape[0], dtype=np.int32)
return np_chains
# Copyright 2021 DeepMind Technologies Limited
# Copyright 2022 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Feature processing logic for multimer data pipeline."""
from typing import Iterable, List, Mapping, MutableMapping
from openfold.data import msa_pairing
from openfold.np import residue_constants
import numpy as np
# TODO: Move this into the config
REQUIRED_FEATURES = frozenset({
'aatype', 'all_atom_mask', 'all_atom_positions', 'all_chains_entity_ids',
'all_crops_all_chains_mask', 'all_crops_all_chains_positions',
'all_crops_all_chains_residue_ids', 'assembly_num_chains', 'asym_id',
'bert_mask', 'cluster_bias_mask', 'deletion_matrix', 'deletion_mean',
'entity_id', 'entity_mask', 'mem_peak', 'msa', 'msa_mask', 'num_alignments',
'num_templates', 'queue_size', 'residue_index', 'resolution',
'seq_length', 'seq_mask', 'sym_id', 'template_aatype',
'template_all_atom_mask', 'template_all_atom_positions'
})
MAX_TEMPLATES = 4
MSA_CROP_SIZE = 2048
def _is_homomer_or_monomer(chains: Iterable[Mapping[str, np.ndarray]]) -> bool:
"""Checks if a list of chains represents a homomer/monomer example."""
# Note that an entity_id of 0 indicates padding.
num_unique_chains = len(np.unique(np.concatenate(
[np.unique(chain['entity_id'][chain['entity_id'] > 0]) for
chain in chains])))
return num_unique_chains == 1
def pair_and_merge(
all_chain_features: MutableMapping[str, Mapping[str, np.ndarray]],
is_prokaryote: bool) -> Mapping[str, np.ndarray]:
"""Runs processing on features to augment, pair and merge.
Args:
all_chain_features: A MutableMapping of dictionaries of features for each chain.
is_prokaryote: Whether the target complex is from a prokaryotic or
eukaryotic organism.
Returns:
A dictionary of features.
"""
process_unmerged_features(all_chain_features)
np_chains_list = list(all_chain_features.values())
pair_msa_sequences = not _is_homomer_or_monomer(np_chains_list)
if pair_msa_sequences:
np_chains_list = msa_pairing.create_paired_features(
chains=np_chains_list, prokaryotic=is_prokaryote)
np_chains_list = msa_pairing.deduplicate_unpaired_sequences(np_chains_list)
np_chains_list = crop_chains(
np_chains_list,
msa_crop_size=MSA_CROP_SIZE,
pair_msa_sequences=pair_msa_sequences,
max_templates=MAX_TEMPLATES
)
np_example = msa_pairing.merge_chain_features(
np_chains_list=np_chains_list, pair_msa_sequences=pair_msa_sequences,
max_templates=MAX_TEMPLATES
)
np_example = process_final(np_example)
return np_example
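# Illustrative pipeline usage (a sketch, not part of the commit;
# `chain_a_feats` and `chain_b_feats` are hypothetical per-chain FeatureDicts
# carrying the usual monomer features plus `*_all_seq` MSA features):
#
#   >>> all_chain_features = {'A': chain_a_feats, 'B': chain_b_feats}
#   >>> np_example = pair_and_merge(all_chain_features, is_prokaryote=False)
#   >>> set(np_example) <= REQUIRED_FEATURES  # process_final filters features
#   True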
def crop_chains(
chains_list: List[Mapping[str, np.ndarray]],
msa_crop_size: int,
pair_msa_sequences: bool,
max_templates: int
) -> List[Mapping[str, np.ndarray]]:
"""Crops the MSAs for a set of chains.
Args:
chains_list: A list of chains to be cropped.
msa_crop_size: The total number of sequences to crop from the MSA.
pair_msa_sequences: Whether we are operating in sequence-pairing mode.
max_templates: The maximum templates to use per chain.
Returns:
The chains cropped.
"""
# Apply the cropping.
cropped_chains = []
for chain in chains_list:
cropped_chain = _crop_single_chain(
chain,
msa_crop_size=msa_crop_size,
pair_msa_sequences=pair_msa_sequences,
max_templates=max_templates)
cropped_chains.append(cropped_chain)
return cropped_chains
def _crop_single_chain(chain: Mapping[str, np.ndarray],
msa_crop_size: int,
pair_msa_sequences: bool,
max_templates: int) -> Mapping[str, np.ndarray]:
"""Crops msa sequences to `msa_crop_size`."""
msa_size = chain['num_alignments']
if pair_msa_sequences:
msa_size_all_seq = chain['num_alignments_all_seq']
msa_crop_size_all_seq = np.minimum(msa_size_all_seq, msa_crop_size // 2)
# We reduce the number of un-paired sequences, by the number of times a
# sequence from this chain's MSA is included in the paired MSA. This keeps
# the MSA size for each chain roughly constant.
msa_all_seq = chain['msa_all_seq'][:msa_crop_size_all_seq, :]
num_non_gapped_pairs = np.sum(
np.any(msa_all_seq != msa_pairing.MSA_GAP_IDX, axis=1))
num_non_gapped_pairs = np.minimum(num_non_gapped_pairs,
msa_crop_size_all_seq)
# Restrict the unpaired crop size so that paired+unpaired sequences do not
# exceed msa_seqs_per_chain for each chain.
max_msa_crop_size = np.maximum(msa_crop_size - num_non_gapped_pairs, 0)
msa_crop_size = np.minimum(msa_size, max_msa_crop_size)
else:
msa_crop_size = np.minimum(msa_size, msa_crop_size)
include_templates = 'template_aatype' in chain and max_templates
if include_templates:
num_templates = chain['template_aatype'].shape[0]
templates_crop_size = np.minimum(num_templates, max_templates)
for k in chain:
k_split = k.split('_all_seq')[0]
if k_split in msa_pairing.TEMPLATE_FEATURES:
chain[k] = chain[k][:templates_crop_size, :]
elif k_split in msa_pairing.MSA_FEATURES:
if '_all_seq' in k and pair_msa_sequences:
chain[k] = chain[k][:msa_crop_size_all_seq, :]
else:
chain[k] = chain[k][:msa_crop_size, :]
chain['num_alignments'] = np.asarray(msa_crop_size, dtype=np.int32)
if include_templates:
chain['num_templates'] = np.asarray(templates_crop_size, dtype=np.int32)
if pair_msa_sequences:
chain['num_alignments_all_seq'] = np.asarray(
msa_crop_size_all_seq, dtype=np.int32)
return chain
def process_final(
np_example: Mapping[str, np.ndarray]
) -> Mapping[str, np.ndarray]:
"""Final processing steps in data pipeline, after merging and pairing."""
np_example = _correct_msa_restypes(np_example)
np_example = _make_seq_mask(np_example)
np_example = _make_msa_mask(np_example)
np_example = _filter_features(np_example)
return np_example
def _correct_msa_restypes(np_example):
"""Correct MSA restype to have the same order as residue_constants."""
new_order_list = residue_constants.MAP_HHBLITS_AATYPE_TO_OUR_AATYPE
np_example['msa'] = np.take(new_order_list, np_example['msa'], axis=0)
np_example['msa'] = np_example['msa'].astype(np.int32)
return np_example
def _make_seq_mask(np_example):
np_example['seq_mask'] = (np_example['entity_id'] > 0).astype(np.float32)
return np_example
def _make_msa_mask(np_example):
"""Mask features are all ones, but will later be zero-padded."""
np_example['msa_mask'] = np.ones_like(np_example['msa'], dtype=np.float32)
seq_mask = (np_example['entity_id'] > 0).astype(np.float32)
np_example['msa_mask'] *= seq_mask[None]
return np_example
def _filter_features(
np_example: Mapping[str, np.ndarray]
) -> Mapping[str, np.ndarray]:
"""Filters features of example to only those requested."""
return {k: v for (k, v) in np_example.items() if k in REQUIRED_FEATURES}
def process_unmerged_features(
all_chain_features: MutableMapping[str, Mapping[str, np.ndarray]]):
"""Postprocessing stage for per-chain features before merging."""
num_chains = len(all_chain_features)
for chain_features in all_chain_features.values():
# Convert deletion matrices to float.
chain_features['deletion_matrix'] = np.asarray(
chain_features.pop('deletion_matrix_int'), dtype=np.float32)
if 'deletion_matrix_int_all_seq' in chain_features:
chain_features['deletion_matrix_all_seq'] = np.asarray(
chain_features.pop('deletion_matrix_int_all_seq'), dtype=np.float32)
chain_features['deletion_mean'] = np.mean(
chain_features['deletion_matrix'], axis=0)
# Add all_atom_mask and dummy all_atom_positions based on aatype.
all_atom_mask = residue_constants.STANDARD_ATOM_MASK[
chain_features['aatype']]
chain_features['all_atom_mask'] = all_atom_mask
chain_features['all_atom_positions'] = np.zeros(
list(all_atom_mask.shape) + [3])
# Add assembly_num_chains.
chain_features['assembly_num_chains'] = np.asarray(num_chains)
# Add entity_mask.
for chain_features in all_chain_features.values():
chain_features['entity_mask'] = (
chain_features['entity_id'] != 0).astype(np.int32)
@@ -18,12 +18,41 @@ import collections
import dataclasses
import itertools
import re
import string
from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple
DeletionMatrix = Sequence[Sequence[int]]
@dataclasses.dataclass(frozen=True)
class Msa:
"""Class representing a parsed MSA file"""
sequences: Sequence[str]
deletion_matrix: DeletionMatrix
descriptions: Sequence[str]
def __post_init__(self):
if(not (
len(self.sequences) ==
len(self.deletion_matrix) ==
len(self.descriptions)
)):
raise ValueError(
"All fields for an MSA must have the same length"
)
def __len__(self):
return len(self.sequences)
def truncate(self, max_seqs: int):
return Msa(
sequences=self.sequences[:max_seqs],
deletion_matrix=self.deletion_matrix[:max_seqs],
descriptions=self.descriptions[:max_seqs],
)
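# Illustrative usage (not part of the commit): the three fields must have
# matching lengths, and truncate returns a new Msa limited to max_seqs rows.
#
#   >>> msa = Msa(
#   ...     sequences=['MKV', 'MK-'],
#   ...     deletion_matrix=[[0, 0, 0], [0, 0, 0]],
#   ...     descriptions=['query', 'hit'],
#   ... )
#   >>> len(msa)
#   2
#   >>> len(msa.truncate(max_seqs=1))
#   1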
@dataclasses.dataclass(frozen=True)
class TemplateHit:
"""Class representing a template hit."""
@@ -31,7 +60,7 @@ class TemplateHit:
index: int
name: str
aligned_cols: int
sum_probs: Optional[float]
query: str
hit_sequence: str
indices_query: List[int]
@@ -67,9 +96,7 @@ def parse_fasta(fasta_string: str) -> Tuple[Sequence[str], Sequence[str]]:
return sequences, descriptions
def parse_stockholm(stockholm_string: str) -> Msa:
"""Parses sequences and deletion matrix from stockholm format alignment.
Args:
@@ -124,10 +151,14 @@ def parse_stockholm(
deletion_count = 0
deletion_matrix.append(deletion_vec)
return Msa(
sequences=msa,
deletion_matrix=deletion_matrix,
descriptions=list(name_to_sequence.keys())
)
def parse_a3m(a3m_string: str) -> Msa:
"""Parses sequences and deletion matrix from a3m format alignment.
Args:
@@ -142,7 +173,7 @@ def parse_a3m(a3m_string: str) -> Msa:
at `deletion_matrix[i][j]` is the number of residues deleted from
the aligned sequence i at residue position j.
"""
sequences, descriptions = parse_fasta(a3m_string)
deletion_matrix = []
for msa_sequence in sequences:
deletion_vec = []
@@ -158,7 +189,11 @@ def parse_a3m(a3m_string: str) -> Msa:
# Make the MSA matrix out of aligned (deletion-free) sequences.
deletion_table = str.maketrans("", "", string.ascii_lowercase)
aligned_sequences = [s.translate(deletion_table) for s in sequences]
return Msa(
sequences=aligned_sequences,
deletion_matrix=deletion_matrix,
descriptions=descriptions
)
def _convert_sto_seq_to_a3m(
@@ -172,7 +207,9 @@ def _convert_sto_seq_to_a3m(
def convert_stockholm_to_a3m(
stockholm_format: str,
max_sequences: Optional[int] = None,
remove_first_row_gaps: bool = True,
) -> str:
"""Converts MSA in Stockholm format to the A3M format."""
descriptions = {}
@@ -210,13 +247,19 @@ def convert_stockholm_to_a3m(
# Convert sto format to a3m line by line
a3m_sequences = {}
if(remove_first_row_gaps):
# query_sequence is assumed to be the first sequence
query_sequence = next(iter(sequences.values()))
query_non_gaps = [res != "-" for res in query_sequence]
for seqname, sto_sequence in sequences.items():
# Dots are optional in a3m format and are commonly removed.
out_sequence = sto_sequence.replace('.', '')
if(remove_first_row_gaps):
out_sequence = ''.join(
_convert_sto_seq_to_a3m(query_non_gaps, out_sequence)
)
a3m_sequences[seqname] = out_sequence
fasta_chunks = (
f">{k} {descriptions.get(k, '')}\n{a3m_sequences[k]}"
@@ -225,6 +268,124 @@ def convert_stockholm_to_a3m(
return "\n".join(fasta_chunks) + "\n"  # Include terminating newline.
def _keep_line(line: str, seqnames: Set[str]) -> bool:
"""Function to decide which lines to keep."""
if not line.strip():
return True
if line.strip() == '//': # End tag
return True
if line.startswith('# STOCKHOLM'): # Start tag
return True
if line.startswith('#=GC RF'): # Reference Annotation Line
return True
if line[:4] == '#=GS': # Description lines - keep if sequence in list.
_, seqname, _ = line.split(maxsplit=2)
return seqname in seqnames
elif line.startswith('#'): # Other markup - filter out
return False
else: # Alignment data - keep if sequence in list.
seqname = line.partition(' ')[0]
return seqname in seqnames
def truncate_stockholm_msa(stockholm_msa_path: str, max_sequences: int) -> str:
"""Reads + truncates a Stockholm file while preventing excessive RAM usage."""
seqnames = set()
filtered_lines = []
with open(stockholm_msa_path) as f:
for line in f:
if line.strip() and not line.startswith(('#', '//')):
# Ignore blank lines, markup and end symbols - remainder are alignment
# sequence parts.
seqname = line.partition(' ')[0]
seqnames.add(seqname)
if len(seqnames) >= max_sequences:
break
f.seek(0)
for line in f:
if _keep_line(line, seqnames):
filtered_lines.append(line)
return ''.join(filtered_lines)
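# Illustrative behaviour (not part of the commit; 'toy.sto' is a hypothetical
# path to a file containing the lines '# STOCKHOLM 1.0', '#=GS seq1 DE x',
# 'seq1 ACDE', 'seq2 ACDF', 'seq3 ACDG', '//'). The first pass stops
# collecting names once max_sequences is reached; the second pass keeps only
# lines that _keep_line accepts for those names:
#
#   >>> print(truncate_stockholm_msa('toy.sto', max_sequences=2))
#   # STOCKHOLM 1.0
#   #=GS seq1 DE x
#   seq1 ACDE
#   seq2 ACDF
#   //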
def remove_empty_columns_from_stockholm_msa(stockholm_msa: str) -> str:
"""Removes empty columns (dashes-only) from a Stockholm MSA."""
processed_lines = {}
unprocessed_lines = {}
for i, line in enumerate(stockholm_msa.splitlines()):
if line.startswith('#=GC RF'):
reference_annotation_i = i
reference_annotation_line = line
# Reached the end of this chunk of the alignment. Process chunk.
_, _, first_alignment = line.rpartition(' ')
mask = []
for j in range(len(first_alignment)):
for _, unprocessed_line in unprocessed_lines.items():
prefix, _, alignment = unprocessed_line.rpartition(' ')
if alignment[j] != '-':
mask.append(True)
break
else: # Every row contained a hyphen - empty column.
mask.append(False)
# Add reference annotation for processing with mask.
unprocessed_lines[reference_annotation_i] = reference_annotation_line
if not any(mask): # All columns were empty. Output empty lines for chunk.
for line_index in unprocessed_lines:
processed_lines[line_index] = ''
else:
for line_index, unprocessed_line in unprocessed_lines.items():
prefix, _, alignment = unprocessed_line.rpartition(' ')
masked_alignment = ''.join(itertools.compress(alignment, mask))
processed_lines[line_index] = f'{prefix} {masked_alignment}'
# Clear raw_alignments.
unprocessed_lines = {}
elif line.strip() and not line.startswith(('#', '//')):
unprocessed_lines[i] = line
else:
processed_lines[i] = line
return '\n'.join((processed_lines[i] for i in range(len(processed_lines))))
def deduplicate_stockholm_msa(stockholm_msa: str) -> str:
"""Remove duplicate sequences (ignoring insertions wrt query)."""
sequence_dict = collections.defaultdict(str)
# First we must extract all sequences from the MSA.
for line in stockholm_msa.splitlines():
# Only consider the alignments - ignore reference annotation, empty lines,
# descriptions or markup.
if line.strip() and not line.startswith(('#', '//')):
line = line.strip()
seqname, alignment = line.split()
sequence_dict[seqname] += alignment
seen_sequences = set()
seqnames = set()
# First alignment is the query.
query_align = next(iter(sequence_dict.values()))
mask = [c != '-' for c in query_align] # Mask is False for insertions.
for seqname, alignment in sequence_dict.items():
# Apply mask to remove all insertions from the string.
masked_alignment = ''.join(itertools.compress(alignment, mask))
if masked_alignment in seen_sequences:
continue
else:
seen_sequences.add(masked_alignment)
seqnames.add(seqname)
filtered_lines = []
for line in stockholm_msa.splitlines():
if _keep_line(line, seqnames):
filtered_lines.append(line)
return '\n'.join(filtered_lines) + '\n'
def _get_hhr_line_regex_groups(
regex_pattern: str, line: str
) -> Sequence[Optional[str]]:
@@ -278,7 +439,7 @@ def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit:
"Could not parse section: %s. Expected this: \n%s to contain summary."
% (detailed_lines, detailed_lines[2])
)
(_, _, _, aligned_cols, _, _, sum_probs, _) = [
float(x) for x in match.groups()
]
@@ -386,3 +547,98 @@ def parse_e_values_from_tblout(tblout: str) -> Dict[str, float]:
target_name = fields[0]
e_values[target_name] = float(e_value)
return e_values
def _get_indices(sequence: str, start: int) -> List[int]:
"""Returns indices for non-gap/insert residues starting at the given index."""
indices = []
counter = start
for symbol in sequence:
# Skip gaps but add a placeholder so that the alignment is preserved.
if symbol == '-':
indices.append(-1)
# Skip deleted residues, but increase the counter.
elif symbol.islower():
counter += 1
# Normal aligned residue. Increase the counter and append to indices.
else:
indices.append(counter)
counter += 1
return indices
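# Illustrative example (not part of the commit): gaps map to -1, and
# lowercase (deleted) residues advance the counter without emitting an index.
#
#   >>> _get_indices('A-bC', start=0)
#   [0, -1, 2]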
@dataclasses.dataclass(frozen=True)
class HitMetadata:
pdb_id: str
chain: str
start: int
end: int
length: int
text: str
def _parse_hmmsearch_description(description: str) -> HitMetadata:
"""Parses the hmmsearch A3M sequence description line."""
# Example 1: >4pqx_A/2-217 [subseq from] mol:protein length:217 Free text
# Example 2: >5g3r_A/1-55 [subseq from] mol:protein length:352
match = re.match(
r'^>?([a-z0-9]+)_(\w+)/([0-9]+)-([0-9]+).*protein length:([0-9]+) *(.*)$',
description.strip())
if not match:
raise ValueError(f'Could not parse description: "{description}".')
return HitMetadata(
pdb_id=match[1],
chain=match[2],
start=int(match[3]),
end=int(match[4]),
length=int(match[5]),
text=match[6]
)
def parse_hmmsearch_a3m(
query_sequence: str,
a3m_string: str,
skip_first: bool = True
) -> Sequence[TemplateHit]:
"""Parses an a3m string produced by hmmsearch.
Args:
query_sequence: The query sequence.
a3m_string: The a3m string produced by hmmsearch.
skip_first: Whether to skip the first sequence in the a3m string.
Returns:
A sequence of `TemplateHit` results.
"""
# Zip the descriptions and MSAs together, skip the first query sequence.
parsed_a3m = list(zip(*parse_fasta(a3m_string)))
if skip_first:
parsed_a3m = parsed_a3m[1:]
indices_query = _get_indices(query_sequence, start=0)
hits = []
for i, (hit_sequence, hit_description) in enumerate(parsed_a3m, start=1):
if 'mol:protein' not in hit_description:
continue # Skip non-protein chains.
metadata = _parse_hmmsearch_description(hit_description)
# Aligned columns are only the match states.
aligned_cols = sum([r.isupper() and r != '-' for r in hit_sequence])
indices_hit = _get_indices(hit_sequence, start=metadata.start - 1)
hit = TemplateHit(
index=i,
name=f'{metadata.pdb_id}_{metadata.chain}',
aligned_cols=aligned_cols,
sum_probs=None,
query=query_sequence,
hit_sequence=hit_sequence.upper(),
indices_query=indices_query,
indices_hit=indices_hit,
)
hits.append(hit)
return hits
@@ -28,6 +28,11 @@ FeatureDict = Mapping[str, np.ndarray]
ModelOutput = Mapping[str, Any]  # Is a nested dict.
PICO_TO_ANGSTROM = 0.01
PDB_CHAIN_IDS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
PDB_MAX_CHAINS = len(PDB_CHAIN_IDS)
assert(PDB_MAX_CHAINS == 62)
@dataclasses.dataclass(frozen=True)
class Protein:
"""Protein structure representation."""
@@ -46,12 +51,23 @@ class Protein:
# Residue index as used in PDB. It is not necessarily continuous or 0-indexed.
residue_index: np.ndarray  # [num_res]
# 0-indexed number corresponding to the chain in the protein that this
# residue belongs to.
chain_index: np.ndarray  # [num_res]
# B-factors, or temperature factors, of each residue (in sq. angstroms units),
# representing the displacement of the residue from its ground truth mean
# value.
b_factors: np.ndarray  # [num_res, num_atom_type]
def __post_init__(self):
if(len(np.unique(self.chain_index)) > PDB_MAX_CHAINS):
raise ValueError(
f"Cannot build an instance with more than {PDB_MAX_CHAINS} "
"chains because these cannot be written to PDB format"
)
def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
"""Takes a PDB string and constructs a Protein object.
@@ -61,9 +77,8 @@ def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
Args:
pdb_str: The contents of the pdb file
chain_id: If chain_id is specified (e.g. A), then only that chain is
parsed. Else, all chains are parsed.
Returns:
A new `Protein` parsed from the pdb contents.
@@ -78,59 +93,61 @@ def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
)
model = models[0]
atom_positions = []
aatype = []
atom_mask = []
residue_index = []
chain_ids = []
b_factors = []
for chain in model:
if(chain_id is not None and chain.id != chain_id):
continue
for res in chain:
if res.id[2] != " ":
raise ValueError(
f"PDB contains an insertion code at chain {chain.id} and residue "
f"index {res.id[1]}. These are not supported."
)
res_shortname = residue_constants.restype_3to1.get(res.resname, "X")
restype_idx = residue_constants.restype_order.get(
res_shortname, residue_constants.restype_num
)
pos = np.zeros((residue_constants.atom_type_num, 3))
mask = np.zeros((residue_constants.atom_type_num,))
res_b_factors = np.zeros((residue_constants.atom_type_num,))
for atom in res:
if atom.name not in residue_constants.atom_types:
continue
pos[residue_constants.atom_order[atom.name]] = atom.coord
mask[residue_constants.atom_order[atom.name]] = 1.0
res_b_factors[
residue_constants.atom_order[atom.name]
] = atom.bfactor
if np.sum(mask) < 0.5:
# If no known atom positions are reported for the residue then skip it.
continue
aatype.append(restype_idx)
atom_positions.append(pos)
atom_mask.append(mask)
residue_index.append(res.id[1])
chain_ids.append(chain.id)
b_factors.append(res_b_factors)
# Chain IDs are usually characters so map these to ints
unique_chain_ids = np.unique(chain_ids)
chain_id_mapping = {cid: n for n, cid in enumerate(unique_chain_ids)}
chain_index = np.array([chain_id_mapping[cid] for cid in chain_ids])
return Protein(
atom_positions=np.array(atom_positions),
atom_mask=np.array(atom_mask),
aatype=np.array(aatype),
residue_index=np.array(residue_index),
chain_index=chain_index,
b_factors=np.array(b_factors),
)
@@ -188,6 +205,14 @@ def from_proteinnet_string(proteinnet_str: str) -> Protein:
)
def _chain_end(atom_index, end_resname, chain_name, residue_index) -> str:
chain_end = 'TER'
return (
f'{chain_end:<6}{atom_index:>5}      {end_resname:>3} '
f'{chain_name:>1}{residue_index:>4}'
)
def to_pdb(prot: Protein) -> str:
"""Converts a `Protein` instance to a PDB string.
@@ -207,16 +232,39 @@ def to_pdb(prot: Protein) -> str:
aatype = prot.aatype
atom_positions = prot.atom_positions
residue_index = prot.residue_index.astype(np.int32)
chain_index = prot.chain_index.astype(np.int32)
b_factors = prot.b_factors
if np.any(aatype > residue_constants.restype_num):
raise ValueError("Invalid aatypes.")
# Construct a mapping from chain integer indices to chain ID strings.
chain_ids = {}
for i in np.unique(chain_index):  # np.unique gives sorted output.
if i >= PDB_MAX_CHAINS:
raise ValueError(
f"The PDB format supports at most {PDB_MAX_CHAINS} chains."
)
chain_ids[i] = PDB_CHAIN_IDS[i]
pdb_lines.append("MODEL     1")
atom_index = 1
last_chain_index = chain_index[0]
# Add all atom sites.
for i in range(aatype.shape[0]):
# Close the previous chain if in a multichain PDB.
if last_chain_index != chain_index[i]:
pdb_lines.append(
_chain_end(
atom_index,
res_1to3(aatype[i - 1]),
chain_ids[chain_index[i - 1]],
residue_index[i - 1]
)
)
last_chain_index = chain_index[i]
atom_index += 1  # Atom index increases at the TER symbol.
res_name_3 = res_1to3(aatype[i])
for atom_name, pos, mask, b_factor in zip(
atom_types, atom_positions[i], atom_mask[i], b_factors[i]
@@ -236,7 +284,7 @@ def to_pdb(prot: Protein) -> str:
# PDB is a columnar format, every space matters here!
atom_line = (
f"{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}"
f"{res_name_3:>3} {chain_ids[chain_index[i]]:>1}"
f"{residue_index[i]:>4}{insertion_code:>1}   "
f"{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}"
f"{occupancy:>6.2f}{b_factor:>6.2f}          "
@@ -245,18 +293,22 @@ def to_pdb(prot: Protein) -> str:
pdb_lines.append(atom_line)
atom_index += 1
# Close the final chain.
pdb_lines.append(
_chain_end(
atom_index,
res_1to3(aatype[-1]),
chain_ids[chain_index[-1]],
residue_index[-1]
)
)
pdb_lines.append("ENDMDL")
pdb_lines.append("END")
# Pad all lines to 80 characters.
pdb_lines = [line.ljust(80) for line in pdb_lines]
return '\n'.join(pdb_lines) + '\n'  # Add terminating newline.
def ideal_atom_mask(prot: Protein) -> np.ndarray:
@@ -279,6 +331,7 @@ def from_prediction(
features: FeatureDict,
result: ModelOutput,
b_factors: Optional[np.ndarray] = None,
remove_leading_feature_dimension: bool = True,
) -> Protein:
"""Assembles a protein from a prediction. """Assembles a protein from a prediction.
...@@ -286,17 +339,30 @@ def from_prediction( ...@@ -286,17 +339,30 @@ def from_prediction(
features: Dictionary holding model inputs. features: Dictionary holding model inputs.
result: Dictionary holding model outputs. result: Dictionary holding model outputs.
b_factors: (Optional) B-factors to use for the protein. b_factors: (Optional) B-factors to use for the protein.
remove_leading_feature_dimension: Whether to remove the leading dimension
of the `features` values
Returns: Returns:
A protein instance. A protein instance.
""" """
def _maybe_remove_leading_dim(arr: np.ndarray) -> np.ndarray:
return arr[0] if remove_leading_feature_dimension else arr
if 'asym_id' in features:
chain_index = _maybe_remove_leading_dim(features["asym_id"])
else:
chain_index = np.zeros_like(
_maybe_remove_leading_dim(features["aatype"])
)
if b_factors is None: if b_factors is None:
b_factors = np.zeros_like(result["final_atom_mask"]) b_factors = np.zeros_like(result["final_atom_mask"])
return Protein( return Protein(
aatype=features["aatype"], aatype=_maybe_remove_leading_dim(features["aatype"]),
atom_positions=result["final_atom_positions"], atom_positions=result["final_atom_positions"],
atom_mask=result["final_atom_mask"], atom_mask=result["final_atom_mask"],
residue_index=features["residue_index"] + 1, residue_index=_maybe_remove_leading_dim(features["residue_index"]) + 1,
chain_index=chain_index,
b_factors=b_factors, b_factors=b_factors,
) )
@@ -17,6 +17,7 @@
import collections
import functools
import os
from typing import Mapping, List, Tuple
from importlib import resources
@@ -448,9 +449,9 @@ def load_stereo_chemical_props() -> Tuple[
("residue_virtual_bonds").
Returns:
residue_bonds: Dict that maps resname -> list of Bond tuples
residue_virtual_bonds: Dict that maps resname -> list of Bond tuples
residue_bond_angles: Dict that maps resname -> list of BondAngle tuples
"""
# TODO: this file should be downloaded in a setup script
stereo_chemical_props = resources.read_text("openfold.resources", "stereo_chemical_props.txt")
...
@@ -619,6 +619,8 @@ def compute_predicted_aligned_error(
def compute_tm(
logits: torch.Tensor,
residue_weights: Optional[torch.Tensor] = None,
asym_id: Optional[torch.Tensor] = None,
interface: bool = False,
max_bin: int = 31,
no_bins: int = 64,
eps: float = 1e-8,
@@ -632,9 +634,9 @@ def compute_tm(
)
bin_centers = _calculate_bin_centers(boundaries)
soft_n = torch.sum(residue_weights, dim=-1)
clipped_n = torch.clamp(soft_n, min=19)
d0 = 1.24 * (clipped_n - 15) ** (1.0 / 3) - 1.8
@@ -643,11 +645,22 @@ def compute_tm(
tm_per_bin = 1.0 / (1 + (bin_centers ** 2) / (d0 ** 2))
predicted_tm_term = torch.sum(probs * tm_per_bin, dim=-1)
n = residue_weights.shape[-1]
pair_mask = residue_weights.new_ones((n, n), dtype=torch.int32)
if interface:
pair_mask *= (asym_id[..., None] != asym_id[..., None, :])
predicted_tm_term *= pair_mask
pair_residue_weights = pair_mask * (
residue_weights[..., None, :] * residue_weights[..., :, None]
)
denom = eps + torch.sum(pair_residue_weights, dim=-1, keepdims=True)
normed_residue_mask = pair_residue_weights / denom
per_alignment = torch.sum(predicted_tm_term * normed_residue_mask, dim=-1)
weighted = per_alignment * residue_weights
idx = weighted.argmax(dim=-1, keepdim=True)
return torch.gather(per_alignment, -1, idx).squeeze(-1)
def tm_loss(
@@ -701,7 +714,7 @@ def tm_loss(
(resolution >= min_resolution) & (resolution <= max_resolution)
)
# Average over the batch dimension
loss = torch.mean(loss)
return loss
...