Merge pull request #405 from aqlaboratory/multimer

Full multimer merge

Merge pull request #405 from aqlaboratory/multimer
Full multimer merge
bb3f51e5 · Christina Floristean · GitHub · ce211367 · c33a0bd6 · bb3f51e5
Unverified Commit bb3f51e5 authored Feb 07, 2024 by Christina Floristean Committed by GitHub Feb 07, 2024
20 changed files
--- a/scripts/download_alphafold_params.sh
+++ b/scripts/download_alphafold_params.sh
@@ -31,7 +31,7 @@ fi

 DOWNLOAD_DIR="$1"
 ROOT_DIR="${DOWNLOAD_DIR}/params"
-SOURCE_URL="https://storage.googleapis.com/alphafold/alphafold_params_2022-01-19.tar"
+SOURCE_URL="https://storage.googleapis.com/alphafold/alphafold_params_2022-12-06.tar"
 BASENAME=$(basename "${SOURCE_URL}")

 mkdir --parents "${ROOT_DIR}"

--- a/scripts/download_mgnify.sh
+++ b/scripts/download_mgnify.sh
@@ -32,8 +32,8 @@ fi
 DOWNLOAD_DIR="$1"
 ROOT_DIR="${DOWNLOAD_DIR}/mgnify"
 # Mirror of:
-# ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2018_12/mgy_clusters.fa.gz
-SOURCE_URL="https://storage.googleapis.com/alphafold-databases/casp14_versions/mgy_clusters_2018_12.fa.gz"
+# ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2022_05/mgy_clusters.fa.gz
+SOURCE_URL="https://storage.googleapis.com/alphafold-databases/v2.3/mgy_clusters_2022_05.fa.gz"
 BASENAME=$(basename "${SOURCE_URL}")

 mkdir --parents "${ROOT_DIR}"

--- a/scripts/download_pdb_seqres.sh
+++ b/scripts/download_pdb_seqres.sh
+#!/bin/bash
+#
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Downloads and unzips the PDB SeqRes database for AlphaFold.
+#
+# Usage: bash download_pdb_seqres.sh /path/to/download/directory
+set -e
+
+# Validate the command line and tooling before touching the network;
+# diagnostics go to stderr so they are not mixed into regular output.
+if [[ $# -eq 0 ]]; then
+    echo "Error: download directory must be provided as an input argument." >&2
+    exit 1
+fi
+
+if ! command -v aria2c &> /dev/null ; then
+    echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)." >&2
+    exit 1
+fi
+
+DOWNLOAD_DIR="$1"
+ROOT_DIR="${DOWNLOAD_DIR}/pdb_seqres"
+SOURCE_URL="ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt"
+# Name of the downloaded file, derived from the URL so that the filtering
+# step below does not repeat the file name as a second literal.
+BASENAME=$(basename "${SOURCE_URL}")
+SEQRES_PATH="${ROOT_DIR}/${BASENAME}"
+FILTERED_PATH="${ROOT_DIR}/pdb_seqres_filtered.txt"
+
+mkdir --parents "${ROOT_DIR}"
+aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
+
+# Keep only protein sequences: every header line matching "mol:protein" plus
+# the single line that follows it. The result goes to a separate file first
+# because grep cannot rewrite its own input, then replaces the download.
+grep --after-context=1 --no-group-separator '>.* mol:protein' "${SEQRES_PATH}" > "${FILTERED_PATH}"
+mv "${FILTERED_PATH}" "${SEQRES_PATH}"
--- a/scripts/download_uniprot.sh
+++ b/scripts/download_uniprot.sh
+#!/bin/bash
+#
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Downloads, unzips and merges the SwissProt and TrEMBL databases for
+# AlphaFold-Multimer.
+#
+# Usage: bash download_uniprot.sh /path/to/download/directory
+set -e
+
+# Validate the command line and tooling before touching the network;
+# diagnostics go to stderr so they are not mixed into regular output.
+if [[ $# -eq 0 ]]; then
+    echo "Error: download directory must be provided as an input argument." >&2
+    exit 1
+fi
+
+if ! command -v aria2c &> /dev/null ; then
+    echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)." >&2
+    exit 1
+fi
+
+DOWNLOAD_DIR="$1"
+ROOT_DIR="${DOWNLOAD_DIR}/uniprot"
+
+# For each archive: the remote URL, the downloaded file name, and the name
+# gunzip leaves behind once the ".gz" suffix is stripped.
+TREMBL_SOURCE_URL="ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz"
+TREMBL_BASENAME=$(basename "${TREMBL_SOURCE_URL}")
+TREMBL_UNZIPPED_BASENAME="${TREMBL_BASENAME%.gz}"
+
+SPROT_SOURCE_URL="ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
+SPROT_BASENAME=$(basename "${SPROT_SOURCE_URL}")
+SPROT_UNZIPPED_BASENAME="${SPROT_BASENAME%.gz}"
+
+mkdir --parents "${ROOT_DIR}"
+aria2c "${TREMBL_SOURCE_URL}" --dir="${ROOT_DIR}"
+aria2c "${SPROT_SOURCE_URL}" --dir="${ROOT_DIR}"
+
+# Every path below is already prefixed with ROOT_DIR, so the working directory
+# is deliberately left unchanged: changing into ROOT_DIR first would make a
+# relative download directory resolve twice (ROOT_DIR/ROOT_DIR/...) and fail.
+gunzip "${ROOT_DIR}/${TREMBL_BASENAME}"
+gunzip "${ROOT_DIR}/${SPROT_BASENAME}"
+
+# Concatenate TrEMBL and SwissProt, rename to uniprot and clean up.
+# SwissProt is appended onto the TrEMBL file in place, which is then renamed.
+cat "${ROOT_DIR}/${SPROT_UNZIPPED_BASENAME}" >> "${ROOT_DIR}/${TREMBL_UNZIPPED_BASENAME}"
+mv "${ROOT_DIR}/${TREMBL_UNZIPPED_BASENAME}" "${ROOT_DIR}/uniprot.fasta"
+rm "${ROOT_DIR}/${SPROT_UNZIPPED_BASENAME}"
--- a/scripts/download_uniref30.sh
+++ b/scripts/download_uniref30.sh
@@ -30,10 +30,15 @@ if ! command -v aria2c &> /dev/null ; then
 fi

 DOWNLOAD_DIR="$1"
-ROOT_DIR="${DOWNLOAD_DIR}"
-SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz"
+ROOT_DIR="${DOWNLOAD_DIR}/uniref30"
+# Mirror of:
+# https://wwwuser.gwdg.de/~compbiol/uniclust/2021_03/UniRef30_2021_03.tar.gz
+SOURCE_URL="https://storage.googleapis.com/alphafold-databases/v2.3/UniRef30_2021_03.tar.gz"
 BASENAME=$(basename "${SOURCE_URL}")

 mkdir --parents "${ROOT_DIR}"
 aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4 --check-certificate=false
-gunzip "${ROOT_DIR}/${BASENAME}"
+tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
+  --directory="${ROOT_DIR}"
+rm "${ROOT_DIR}/${BASENAME}"
+
--- a/scripts/flatten_roda.sh
+++ b/scripts/flatten_roda.sh
@@ -26,7 +26,12 @@ mkdir -p "${ALIGNMENT_DIR}"
 for chain_dir in $(ls "${RODA_DIR}"); do
    CHAIN_DIR_PATH="${RODA_DIR}/${chain_dir}"
    for subdir in $(ls "${CHAIN_DIR_PATH}"); do
-        if [[ $subdir = "pdb" ]] || [[ $subdir = "cif" ]]; then
+        # Test the entry inside its chain directory; the bare name would be
+        # resolved against the current working directory instead.
+        if [[ ! -d "${CHAIN_DIR_PATH}/${subdir}" ]]; then
+            echo "$subdir is not directory"
+            continue
+        elif [[ -z $(ls "${CHAIN_DIR_PATH}/${subdir}") ]]; then
+            # Nothing to move out of an empty directory.
+            continue
+        elif [[ $subdir = "pdb" ]] || [[ $subdir = "cif" ]]; then
            mv "${CHAIN_DIR_PATH}/${subdir}"/* "${DATA_DIR}"
        else
            CHAIN_ALIGNMENT_DIR="${ALIGNMENT_DIR}/${chain_dir}"

--- a/scripts/generate_alphafold_feature_dict.py
+++ b/scripts/generate_alphafold_feature_dict.py
@@ -2,35 +2,62 @@ import argparse
 import os
 import pickle

-from alphafold.data import pipeline, templates
+from alphafold.data import pipeline, pipeline_multimer, templates
+from alphafold.data.tools import hmmsearch, hhsearch

 from scripts.utils import add_data_args


 def main(args):
-    template_featurizer = templates.TemplateHitFeaturizer(
-        mmcif_dir=args.mmcif_dir,
+    if (args.multimer):
+        template_searcher = hmmsearch.Hmmsearch(
+            binary_path=args.hmmsearch_binary_path,
+            hmmbuild_binary_path=args.hmmbuild_binary_path,
+            database_path=args.pdb_seqres_database_path,
+        )
+
+        template_featurizer = templates.HmmsearchHitFeaturizer(
+            mmcif_dir=args.template_mmcif_dir,
+            max_template_date=args.max_template_date,
+            max_hits=20,
+            kalign_binary_path=args.kalign_binary_path,
+            release_dates_path=args.release_dates_path,
+            obsolete_pdbs_path=args.obsolete_pdbs_path
+        )
+    else:
+        template_searcher = hhsearch.HHSearch(
+            binary_path=args.hhsearch_binary_path,
+            databases=[args.pdb70_database_path],
+        )
+
+        template_featurizer = templates.HhsearchHitFeaturizer(
+            mmcif_dir=args.template_mmcif_dir,
            max_template_date=args.max_template_date,
            max_hits=20,
            kalign_binary_path=args.kalign_binary_path,
            release_dates_path=None,
-        obsolete_pdbs_path=args.obsolete_pdbs_path,
+            obsolete_pdbs_path=args.obsolete_pdbs_path
        )

    data_pipeline = pipeline.DataPipeline(
        jackhmmer_binary_path=args.jackhmmer_binary_path,
        hhblits_binary_path=args.hhblits_binary_path,
-        hhsearch_binary_path=args.hhsearch_binary_path,
        uniref90_database_path=args.uniref90_database_path,
        mgnify_database_path=args.mgnify_database_path,
        bfd_database_path=args.bfd_database_path,
-        uniclust30_database_path=args.uniclust30_database_path,
-        pdb70_database_path=args.pdb70_database_path,
+        uniref30_database_path=args.uniref30_database_path,
        small_bfd_database_path=None,
        template_featurizer=template_featurizer,
+        template_searcher=template_searcher,
        use_small_bfd=False,
    )

+    if (args.multimer):
+        data_pipeline = pipeline_multimer.DataPipeline(
+            monomer_data_pipeline=data_pipeline,
+            jackhmmer_binary_path=args.jackhmmer_binary_path,
+            uniprot_database_path=args.uniprot_database_path)
+
    feature_dict = data_pipeline.process(
        input_fasta_path=args.fasta_path,
        msa_output_dir=args.output_dir,
@@ -44,6 +71,7 @@ if __name__ == "__main__":
    parser.add_argument("fasta_path", type=str)
    parser.add_argument("mmcif_dir", type=str)
    parser.add_argument("output_dir", type=str)
+    parser.add_argument("--multimer", action='store_true')
    add_data_args(parser)

    args = parser.parse_args()

--- a/scripts/generate_chain_data_cache.py
+++ b/scripts/generate_chain_data_cache.py
@@ -4,10 +4,11 @@ import json
 import logging
 from multiprocessing import Pool
 import os
-
+import string
 import sys
 sys.path.append(".") # an innocent hack to get this to run from the top level

+from collections import defaultdict
 from tqdm import tqdm

 from openfold.data.mmcif_parsing import parse 
@@ -49,20 +50,27 @@ def parse_file(
            pdb_string = fp.read()
          
        protein_object = protein.from_pdb_string(pdb_string, None)
+        aatype = protein_object.aatype
+        chain_index = protein_object.chain_index

-        chain_dict = {} 
-        chain_dict["seq"] = residue_constants.aatype_to_str_sequence(
-            protein_object.aatype,
-        )
-        chain_dict["resolution"] = 0.
+        chain_dict = defaultdict(list)
+        for i in range(aatype.shape[0]):
+            chain_dict[chain_index[i]].append(residue_constants.restypes_with_x[aatype[i]])
+
+        out = {}
+        chain_tags = string.ascii_uppercase
+        for chain, seq in chain_dict.items():
+            full_name = "_".join([file_id, chain_tags[chain]])
+            out[full_name] = {}
+            local_data = out[full_name]
+            local_data["resolution"] = 0.
+            local_data["seq"] = ''.join(seq)
        
            if(chain_cluster_size_dict is not None):
                cluster_size = chain_cluster_size_dict.get(
                    full_name.upper(), -1
                )
-            chain_dict["cluster_size"] = cluster_size
-
-        out = {file_id: chain_dict}
+                local_data["cluster_size"] = cluster_size

    return out


--- a/scripts/generate_mmcif_cache.py
+++ b/scripts/generate_mmcif_cache.py
@@ -13,7 +13,7 @@ from tqdm import tqdm
 from openfold.data.mmcif_parsing import parse 


-def parse_file(f, args):
+def parse_file(f, args, chain_cluster_size_dict=None):
    with open(os.path.join(args.mmcif_dir, f), "r") as fp:
        mmcif_string = fp.read()
    file_id = os.path.splitext(f)[0]
@@ -28,6 +28,18 @@ def parse_file(f, args):
    local_data["release_date"] = mmcif.header["release_date"]

    chain_ids, seqs = list(zip(*mmcif.chain_to_seqres.items()))
+
+    if chain_cluster_size_dict is not None:
+        cluster_sizes = []
+        for chain_id in chain_ids:
+            full_name = "_".join([file_id, chain_id])
+            cluster_size = chain_cluster_size_dict.get(
+                full_name.upper(), -1
+            )
+            cluster_sizes.append(cluster_size)
+
+        local_data["cluster_sizes"] = cluster_sizes
+
    local_data["chain_ids"] = chain_ids
    local_data["seqs"] = seqs
    local_data["no_chains"] = len(chain_ids)
@@ -38,8 +50,21 @@ def parse_file(f, args):


 def main(args):
+    chain_cluster_size_dict = None
+    if args.cluster_file is not None:
+        chain_cluster_size_dict = {}
+        with open(args.cluster_file, "r") as fp:
+            clusters = [l.strip() for l in fp.readlines()]
+
+        for cluster in clusters:
+            chain_ids = cluster.split()
+            cluster_len = len(chain_ids)
+            for chain_id in chain_ids:
+                chain_id = chain_id.upper()
+                chain_cluster_size_dict[chain_id] = cluster_len
+
    files = [f for f in os.listdir(args.mmcif_dir) if ".cif" in f]
-    fn = partial(parse_file, args=args)
+    fn = partial(parse_file, args=args, chain_cluster_size_dict=chain_cluster_size_dict)
    data = {}
    with Pool(processes=args.no_workers) as p:
        with tqdm(total=len(files)) as pbar:
@@ -63,6 +88,15 @@ if __name__ == "__main__":
        "--no_workers", type=int, default=4,
        help="Number of workers to use for parsing"
    )
+    parser.add_argument(
+        "--cluster_file", type=str, default=None,
+        help=(
+            "Path to a cluster file (e.g. PDB40), one cluster "
+            "({PROT1_ID}_{CHAIN_ID} {PROT2_ID}_{CHAIN_ID} ...) per line. "
+            "Chains not in this cluster file will NOT be filtered by cluster "
+            "size."
+        )
+    )
    parser.add_argument(
        "--chunksize", type=int, default=10,
        help="How many files should be distributed to each worker at a time"

--- a/scripts/precompute_alignments.py
+++ b/scripts/precompute_alignments.py
@@ -11,6 +11,7 @@ import tempfile
 import openfold.data.mmcif_parsing as mmcif_parsing
 from openfold.data.data_pipeline import AlignmentRunner
 from openfold.data.parsers import parse_fasta
+from openfold.data.tools import hhsearch, hmmsearch
 from openfold.np import protein, residue_constants

 from utils import add_data_args
@@ -39,7 +40,8 @@ def run_seq_group_alignments(seq_groups, alignment_runner, args):
            alignment_runner.run(
                fasta_path, alignment_dir
            )
-        except:
+        except Exception as e:
+            logging.warning(e)
            logging.warning(f"Failed to run alignments for {first_name}. Skipping...")
            os.remove(fasta_path)
            os.rmdir(alignment_dir)
@@ -114,15 +116,30 @@ def parse_and_align(files, alignment_runner, args):

 def main(args):
    # Build the alignment tool runner
+    if args.hmmsearch_binary_path is not None and args.pdb_seqres_database_path is not None:
+        template_searcher = hmmsearch.Hmmsearch(
+            binary_path=args.hmmsearch_binary_path,
+            hmmbuild_binary_path=args.hmmbuild_binary_path,
+            database_path=args.pdb_seqres_database_path,
+        )
+    elif args.hhsearch_binary_path is not None and args.pdb70_database_path is not None:
+        template_searcher = hhsearch.HHSearch(
+            binary_path=args.hhsearch_binary_path,
+            databases=[args.pdb70_database_path],
+        )
+    else:
+        template_searcher = None
+
    alignment_runner = AlignmentRunner(
        jackhmmer_binary_path=args.jackhmmer_binary_path,
        hhblits_binary_path=args.hhblits_binary_path,
-        hhsearch_binary_path=args.hhsearch_binary_path,
        uniref90_database_path=args.uniref90_database_path,
        mgnify_database_path=args.mgnify_database_path,
        bfd_database_path=args.bfd_database_path,
+        uniref30_database_path=args.uniref30_database_path,
        uniclust30_database_path=args.uniclust30_database_path,
-        pdb70_database_path=args.pdb70_database_path,
+        uniprot_database_path=args.uniprot_database_path,
+        template_searcher=template_searcher,
        use_small_bfd=args.bfd_database_path is None,
        no_cpus=args.cpus_per_task,
    )

--- a/scripts/utils.py
+++ b/scripts/utils.py
@@ -14,9 +14,18 @@ def add_data_args(parser: argparse.ArgumentParser):
    parser.add_argument(
        '--pdb70_database_path', type=str, default=None,
    )
+    parser.add_argument(
+        '--pdb_seqres_database_path', type=str, default=None,
+    )
+    parser.add_argument(
+        '--uniref30_database_path', type=str, default=None,
+    )
    parser.add_argument(
        '--uniclust30_database_path', type=str, default=None,
    )
+    parser.add_argument(
+        '--uniprot_database_path', type=str, default=None,
+    )
    parser.add_argument(
        '--bfd_database_path', type=str, default=None,
    )
@@ -29,6 +38,12 @@ def add_data_args(parser: argparse.ArgumentParser):
    parser.add_argument(
        '--hhsearch_binary_path', type=str, default='/usr/bin/hhsearch'
    )
+    parser.add_argument(
+        '--hmmsearch_binary_path', type=str, default='/usr/bin/hmmsearch'
+    )
+    parser.add_argument(
+        '--hmmbuild_binary_path', type=str, default='/usr/bin/hmmbuild'
+    )
    parser.add_argument(
        '--kalign_binary_path', type=str, default='/usr/bin/kalign'
    )

--- a/setup.py
+++ b/setup.py
@@ -75,6 +75,8 @@ for major, minor in list(compute_capabilities):

 extra_cuda_flags += cc_flag

+cc_flag = ['-gencode', 'arch=compute_70,code=sm_70']
+
 if bare_metal_major != -1:
    modules = [CUDAExtension(
        name="attn_core_inplace_cuda",
@@ -111,10 +113,10 @@ else:

 setup(
    name='openfold',
-    version='1.0.1',
+    version='2.0.0',
    description='A PyTorch reimplementation of DeepMind\'s AlphaFold 2',
-    author='Gustaf Ahdritz & DeepMind',
-    author_email='gahdritz@gmail.com',
+    author='OpenFold Team',
+    author_email='jennifer.wei@omsf.io',
    license='Apache License, Version 2.0',
    url='https://github.com/aqlaboratory/openfold',
    packages=find_packages(exclude=["tests", "scripts"]),

--- a/tests/compare_utils.py
+++ b/tests/compare_utils.py
@@ -6,11 +6,14 @@ import sys
 import unittest

 import numpy as np
+import torch

 from openfold.config import model_config
 from openfold.model.model import AlphaFold
 from openfold.utils.import_weights import import_jax_weights_

+from tests.config import consts
+
 # Give JAX some GPU memory discipline
 # (by default it hogs 90% of GPU memory. This disables that behavior and also
 # forces it to proactively free memory that it allocates)
@@ -57,26 +60,27 @@ def import_alphafold():


 def get_alphafold_config():
-    config = alphafold.model.config.model_config("model_1_ptm")  # noqa
+    config = alphafold.model.config.model_config(consts.model)  # noqa
    config.model.global_config.deterministic = True
    return config


-_param_path = "openfold/resources/params/params_model_1_ptm.npz"
+dir_path = os.path.dirname(os.path.realpath(__file__))
+_param_path = os.path.join(dir_path, "..", f"openfold/resources/params/params_{consts.model}.npz")
 _model = None


 def get_global_pretrained_openfold():
    global _model
    if _model is None:
-        _model = AlphaFold(model_config("model_1_ptm"))
+        _model = AlphaFold(model_config(consts.model))
        _model = _model.eval()
        if not os.path.exists(_param_path):
            raise FileNotFoundError(
                """Cannot load pretrained parameters. Make sure to run the 
                installation script before running tests."""
            )
-        import_jax_weights_(_model, _param_path, version="model_1_ptm")
+        import_jax_weights_(_model, _param_path, version=consts.model)
        _model = _model.cuda()

    return _model
@@ -97,7 +101,7 @@ def _remove_key_prefix(d, prefix):
    for k, v in list(d.items()):
        if k.startswith(prefix):
            d.pop(k)
-            d[k[len(prefix) :]] = v
+            d[k[len(prefix):]] = v


 def fetch_alphafold_module_weights(weight_path):
@@ -106,7 +110,6 @@ def fetch_alphafold_module_weights(weight_path):
    if "/" in weight_path:
        spl = weight_path.split("/")
        spl = spl if len(spl[-1]) != 0 else spl[:-1]
-        module_name = spl[-1]
        prefix = "/".join(spl[:-1]) + "/"
        _remove_key_prefix(params, prefix)

@@ -117,3 +120,20 @@ def fetch_alphafold_module_weights(weight_path):
            "Make sure to call import_alphafold before running this function"
        )
    return params
+
+
+def _assert_abs_diff_small_base(compare_func, expected, actual, eps):
+    """Assert that a reduction of |expected - actual| is close to zero.
+
+    Args:
+        compare_func: Callable that reduces the absolute-difference tensor to
+            a single tensor (the wrappers below pass torch.max / torch.mean).
+        expected: Reference torch tensor.
+        actual: Torch tensor compared against ``expected``.
+        eps: Absolute tolerance passed to torch.testing.assert_close.
+
+    Raises:
+        AssertionError: If the reduced error is not within tolerance of zero.
+    """
+    abs_diff = torch.abs(expected - actual)
+    err = compare_func(abs_diff)
+    # Build the zero reference from err itself so it shares err's dtype AND
+    # device: assert_close also compares devices by default, so a CPU-only
+    # zero would be rejected whenever the inputs live on the GPU.
+    zero_tensor = torch.zeros_like(err)
+    # A looser relative tolerance is selected for bfloat16 errors.
+    rtol = 1.6e-2 if err.dtype == torch.bfloat16 else 1.3e-6
+    torch.testing.assert_close(err, zero_tensor, atol=eps, rtol=rtol)
+
+
+def assert_max_abs_diff_small(expected, actual, eps):
+    """Assert that the largest element-wise absolute difference is within eps."""
+    _assert_abs_diff_small_base(torch.max, expected, actual, eps)
+
+
+def assert_mean_abs_diff_small(expected, actual, eps):
+    """Assert that the mean element-wise absolute difference is within eps."""
+    _assert_abs_diff_small_base(torch.mean, expected, actual, eps)
--- a/tests/config.py
+++ b/tests/config.py
 import ml_collections as mlc

-consts = mlc.ConfigDict(
+
+monomer_consts = mlc.ConfigDict(
    {
+        "model": "model_1_ptm",  # monomer:model_1_ptm, multimer: model_1_multimer_v3
+        "is_multimer": False,  # monomer: False, multimer: True
+        "chunk_size": 4,
        "batch_size": 2,
-        "n_res": 11,
+        "n_res": 22,
        "n_seq": 13,
        "n_templ": 3,
        "n_extra": 17,
@@ -16,9 +20,37 @@ consts = mlc.ConfigDict(
        "c_s": 384,
        "c_t": 64,
        "c_e": 64,
+        "msa_logits": 23,  # monomer: 23, multimer: 22
+        "template_mmcif_dir": None  # Set for test_multimer_datamodule
    }
 )

+# Test constants for the multimer configuration: same keys as monomer_consts,
+# with the values that differ called out in the trailing comments below.
+multimer_consts = mlc.ConfigDict(
+    {
+        "model": "model_1_multimer_v3",  # monomer:model_1_ptm, multimer: model_1_multimer_v3
+        "is_multimer": True,  # monomer: False, multimer: True
+        "chunk_size": 4,
+        "batch_size": 2,
+        "n_res": 22,
+        "n_seq": 13,
+        "n_templ": 3,
+        "n_extra": 17,
+        "n_heads_extra_msa": 8,
+        "eps": 5e-4,
+        # For compatibility with DeepMind's pretrained weights, it's easiest for
+        # everyone if these take their real values.
+        "c_m": 256,
+        "c_z": 128,
+        "c_s": 384,
+        "c_t": 64,
+        "c_e": 64,
+        "msa_logits": 22,  # monomer: 23, multimer: 22
+        "template_mmcif_dir": None  # Set for test_multimer_datamodule
+    }
+)
+
+consts = monomer_consts 
+
 config = mlc.ConfigDict(
    {
        "data": {

--- a/tests/data_utils.py
+++ b/tests/data_utils.py
@@ -12,10 +12,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from random import randint
 import torch
 import numpy as np
 from scipy.spatial.transform import Rotation

+from tests.config import consts
+
+
+def random_asym_ids(n_res, split_chains=True, min_chain_len=4):
+    """Generate random per-residue chain (asym) ids for tests.
+
+    Args:
+        n_res: Number of residues to label.
+        split_chains: If False, no chain assignment is drawn and a plain list
+            of ``n_res`` zeros is returned.
+        min_chain_len: Lower bound used when drawing the length of each chain
+            except the last, which receives the remaining residues.
+
+    Returns:
+        ``[0] * n_res`` (a Python list) when ``split_chains`` is False.
+        Otherwise a float32 numpy array of length ``n_res`` holding 1-based
+        chain ids; a single chain (all ones) unless ``consts.is_multimer``.
+    """
+    # Return before drawing anything: the unsplit result needs no chain count,
+    # so it should neither consume random state nor fail for a short n_res.
+    if not split_chains:
+        return [0] * n_res
+
+    if consts.is_multimer:
+        # randint requires upper >= lower; clamp so that n_res < min_chain_len
+        # falls back to a single chain instead of raising ValueError.
+        n_chain = randint(1, max(1, n_res // min_chain_len))
+    else:
+        n_chain = 1
+
+    assert n_res >= n_chain
+
+    pieces = []
+    asym_ids = []
+    final_idx = n_chain - 1
+    for idx in range(n_chain - 1):
+        # Largest length this chain may take, computed from the residues
+        # still unassigned, the chain count and min_chain_len.
+        n_stop = (n_res - sum(pieces) - n_chain + idx - min_chain_len)
+        if n_stop <= min_chain_len:
+            # Not enough room for another split: the current index becomes
+            # the last chain and absorbs everything that is left.
+            final_idx = idx
+            break
+        piece = randint(min_chain_len, n_stop)
+        pieces.append(piece)
+        asym_ids.extend(piece * [idx])
+    asym_ids.extend((n_res - sum(pieces)) * [final_idx])
+
+    # Shift the 0-based indices to 1-based ids.
+    return np.array(asym_ids).astype(np.float32) + 1
+

 def random_template_feats(n_templ, n, batch_size=None):
    b = []
@@ -40,6 +67,11 @@ def random_template_feats(n_templ, n, batch_size=None):
    }
    batch = {k: v.astype(np.float32) for k, v in batch.items()}
    batch["template_aatype"] = batch["template_aatype"].astype(np.int64)
+
+    if consts.is_multimer:
+        asym_ids = np.array(random_asym_ids(n))
+        batch["asym_id"] = np.tile(asym_ids[np.newaxis, :], (*b, n_templ, 1))
+
    return batch



--- a/tests/test_data/alignments/2q2k_A/bfd_uniclust_hits.a3m
+++ b/tests/test_data/alignments/2q2k_A/bfd_uniclust_hits.a3m
+>query
+MGSSHHHHHHSSGLVPGSHMDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+>tr|A0A2W3M096|A0A2W3M096_STAAU Plasmid segregation protein ParR OS=Staphylococcus aureus OX=1280 GN=C7Q70_14145 PE=4 SV=1
+-------------------MDKKETQHLLKIKKQDYPQIFNFLEGLPKGTKTAHIREALMRYIAEEGNTP
+>tr|A0A0Q9XW80|A0A0Q9XW80_9STAP Uncharacterized protein OS=Staphylococcus sp. NAM3COL9 GN=ACA31_00310 PE=4 SV=1
+-------------------MSKQETNHLLKIKKKDYPQIFEFLEGVPKGTKTAHIREALLRYIEELGAPP
+>tr|A0A1E5U0W4|A0A1E5U0W4_STAXY Uncharacterized protein OS=Staphylococcus xylosus GN=AST15_04830 PE=4 SV=1
+-------------------MSKQETNHLLKIKKKDYPQIFDFLENVPKGTKTAHIREALIRYINDLGDTpP
--- a/tests/test_data/alignments/2q2k_A/hmm_output.sto
+++ b/tests/test_data/alignments/2q2k_A/hmm_output.sto
--- a/tests/test_data/alignments/2q2k_A/mgnify_hits.sto
+++ b/tests/test_data/alignments/2q2k_A/mgnify_hits.sto
+# STOCKHOLM 1.0
+
+#=GS MGYP000048211747/1-51    DE [subseq from] PL=00 UP=0 BIOMES=0000000011000
+#=GS MGYP000256545448/1-51    DE [subseq from] PL=00 UP=0 BIOMES=0000000011000
+#=GS MGYP000517307434/104-157 DE [subseq from] PL=11 UP=0 BIOMES=0000000011000
+#=GS MGYP000971940026/195-224 DE [subseq from] PL=10 UP=0 BIOMES=0110000000000
+#=GS MGYP000859660985/46-74   DE [subseq from] PL=10 UP=0 BIOMES=0110000000000
+#=GS MGYP000859660985/83-111  DE [subseq from] PL=10 UP=0 BIOMES=0110000000000
+
+query                            MGSSHHHHHHSSGLVPGSHMDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+MGYP000048211747/1-51            -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+MGYP000256545448/1-51            -------------------MKKKETQHLLKIKKEDYPQIFDFLEGLPRGTKTAHIREALLRYIADEGENP
+MGYP000517307434/104-157         ----------------GDLLRQKETQHLLKIKKEDYPQIFDFLEGLPRGTKTAHIREALLRYIADEGENP
+MGYP000971940026/195-224         ------------------------------VKKSDLGQVTSFLKEVPEGKKQDVLDEVLK----------
+MGYP000859660985/46-74           ------------------------------IKKSDLGQVASFLKEVPEGQKQEVLDQVL-----------
+MGYP000859660985/83-111          ------------------------------IKKSDLGQVASFLKEVPEGQKQEVLDQVL-----------
+#=GC RF                          xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+//
--- a/tests/test_data/alignments/2q2k_A/uniprot_hits.sto
+++ b/tests/test_data/alignments/2q2k_A/uniprot_hits.sto
+# STOCKHOLM 1.0
+
+#=GS tr|A0A0K0ME10|A0A0K0ME10_9STAP/1-51 DE [subseq from] Plasmid segregation protein ParR OS=Staphylococcus schleiferi OX=1295 GN=NP71_p00120 PE=4 SV=1
+#=GS tr|A0A0C5BVQ8|A0A0C5BVQ8_STAAU/1-51 DE [subseq from] Plasmid segregation protein ParR OS=Staphylococcus aureus OX=1280 GN=parR PE=4 SV=1
+#=GS tr|A0A0D4ZYK6|A0A0D4ZYK6_STAEP/1-51 DE [subseq from] DNA-binding protein ParR OS=Staphylococcus epidermidis OX=1282 GN=parR PE=4 SV=1
+#=GS tr|A0A0H2XKQ4|A0A0H2XKQ4_STAA3/1-51 DE [subseq from] Plasmid segregation protein ParR OS=Staphylococcus aureus (strain USA300) OX=367830 GN=SAUSA300_pUSA030035 PE=4 SV=1
+#=GS tr|A0A0N9NJL4|A0A0N9NJL4_STAPS/1-51 DE [subseq from] Putative plasmid segregation protein ParR OS=Staphylococcus pseudintermedius OX=283734 GN=parR PE=4 SV=1
+#=GS tr|A0A0U2CJ65|A0A0U2CJ65_STAEP/1-51 DE [subseq from] Plasmid segregation protein OS=Staphylococcus epidermidis OX=1282 GN=parR PE=4 SV=1
+#=GS tr|A0A133QXU6|A0A133QXU6_STASI/1-51 DE [subseq from] Plasmid segregation protein ParR OS=Staphylococcus simulans OX=1286 GN=HMPREF3215_00002 PE=4 SV=1
+#=GS tr|A0A141BHY3|A0A141BHY3_STAXY/1-51 DE [subseq from] DNA-binding protein OS=Staphylococcus xylosus OX=1288 GN=p11 PE=4 SV=1
+#=GS tr|A0A141HMK9|A0A141HMK9_STAA8/1-51 DE [subseq from] ParR OS=Staphylococcus aureus subsp. aureus RN4220 OX=561307 GN=pGO400_p33 PE=4 SV=1
+#=GS tr|A0A1B1UXS0|A0A1B1UXS0_STALU/1-51 DE [subseq from] Plasmid segregation protein ParR OS=Staphylococcus lugdunensis OX=28035 GN=parR PE=4 SV=1
+#=GS tr|A0A1S7BGJ1|A0A1S7BGJ1_STAAU/1-51 DE [subseq from] DNA-binding protein ParR OS=Staphylococcus aureus OX=1280 GN=parR PE=4 SV=1
+#=GS tr|A0A418HED5|A0A418HED5_STAGA/1-51 DE [subseq from] Plasmid segregation protein ParR OS=Staphylococcus gallinarum OX=1293 GN=BUY97_07835 PE=4 SV=1
+#=GS tr|A0A507SJ94|A0A507SJ94_9STAP/1-51 DE [subseq from] Plasmid segregation protein ParR OS=Staphylococcus sp. SKL71187 OX=2497688 GN=EKV43_01520 PE=4 SV=1
+#=GS tr|A0A6N0I4W4|A0A6N0I4W4_STAHO/1-51 DE [subseq from] Plasmid segregation protein ParR OS=Staphylococcus hominis OX=1290 GN=FOB69_12695 PE=4 SV=1
+#=GS tr|A0A7G3T6L6|A0A7G3T6L6_9STAP/1-51 DE [subseq from] Plasmid segregation protein OS=Staphylococcus equorum OX=246432 PE=4 SV=1
+#=GS tr|A0A848F022|A0A848F022_STACP/1-51 DE [subseq from] Plasmid segregation protein ParR OS=Staphylococcus capitis OX=29388 GN=HHM13_04665 PE=4 SV=1
+#=GS tr|O87365|O87365_STAAU/1-51         DE [subseq from] Conserved domain protein OS=Staphylococcus aureus OX=1280 GN=parR PE=1 SV=1
+#=GS tr|A0A7G3L2E1|A0A7G3L2E1_STAAU/1-51 DE [subseq from] Plasmid segregation protein ParR OS=Staphylococcus aureus OX=1280 PE=4 SV=1
+#=GS tr|E4PYH1|E4PYH1_STAAU/1-39         DE [subseq from] DUF655 domain-containing protein OS=Staphylococcus aureus OX=1280 GN=SUM_0041p2 PE=4 SV=1
+#=GS tr|A0A0Q9XW80|A0A0Q9XW80_9STAP/1-51 DE [subseq from] RHH_1 domain-containing protein OS=Staphylococcus sp. NAM3COL9 OX=1667172 GN=ACA31_00310 PE=4 SV=1
+
+query                                       MGSSHHHHHHSSGLVPGSHMDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A0K0ME10|A0A0K0ME10_9STAP/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A0C5BVQ8|A0A0C5BVQ8_STAAU/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A0D4ZYK6|A0A0D4ZYK6_STAEP/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A0H2XKQ4|A0A0H2XKQ4_STAA3/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A0N9NJL4|A0A0N9NJL4_STAPS/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A0U2CJ65|A0A0U2CJ65_STAEP/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A133QXU6|A0A133QXU6_STASI/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A141BHY3|A0A141BHY3_STAXY/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A141HMK9|A0A141HMK9_STAA8/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A1B1UXS0|A0A1B1UXS0_STALU/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A1S7BGJ1|A0A1S7BGJ1_STAAU/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A418HED5|A0A418HED5_STAGA/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A507SJ94|A0A507SJ94_9STAP/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A6N0I4W4|A0A6N0I4W4_STAHO/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A7G3T6L6|A0A7G3T6L6_9STAP/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A848F022|A0A848F022_STACP/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|O87365|O87365_STAAU/1-51                 -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|A0A7G3L2E1|A0A7G3L2E1_STAAU/1-51         -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+tr|E4PYH1|E4PYH1_STAAU/1-39                 -------------------MDKKETKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREA------------
+tr|A0A0Q9XW80|A0A0Q9XW80_9STAP/1-51         -------------------MSKQETNHLLKIKKKDYPQIFEFLEGVPKGTKTAHIREALLRYIEELGAPP
+#=GC RF                                     xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+//
--- a/tests/test_data/alignments/2q2k_A/uniref90_hits.sto
+++ b/tests/test_data/alignments/2q2k_A/uniref90_hits.sto
+# STOCKHOLM 1.0
+
+#=GS UniRef90_A0A141BHY3/1-51    DE [subseq from] DNA-binding protein n=37 Tax=Staphylococcaceae TaxID=90964 RepID=A0A141BHY3_STAXY
+#=GS UniRef90_UPI000A061283/1-51 DE [subseq from] plasmid segregation protein ParR n=1 Tax=Mammaliicoccus sciuri TaxID=1296 RepID=UPI000A061283
+#=GS UniRef90_UPI001E649B27/1-51 DE [subseq from] plasmid segregation protein ParR n=1 Tax=Mammaliicoccus sciuri TaxID=1296 RepID=UPI001E649B27
+#=GS UniRef90_UPI00201A2D50/1-51 DE [subseq from] plasmid segregation protein ParR n=1 Tax=Staphylococcus aureus TaxID=1280 RepID=UPI00201A2D50
+#=GS UniRef90_UPI0018EDBA69/1-51 DE [subseq from] plasmid segregation protein ParR n=1 Tax=Staphylococcus aureus TaxID=1280 RepID=UPI0018EDBA69
+#=GS UniRef90_UPI0005E12F5A/1-51 DE [subseq from] plasmid segregation protein ParR n=1 Tax=Staphylococcus TaxID=1279 RepID=UPI0005E12F5A
+#=GS UniRef90_UPI00207B21F3/1-51 DE [subseq from] plasmid segregation protein ParR n=2 Tax=Staphylococcus TaxID=1279 RepID=UPI00207B21F3
+#=GS UniRef90_UPI0009836679/1-51 DE [subseq from] plasmid segregation protein ParR n=2 Tax=Staphylococcus aureus TaxID=1280 RepID=UPI0009836679
+#=GS UniRef90_UPI001F5439CD/1-51 DE [subseq from] plasmid segregation protein ParR n=11 Tax=Staphylococcaceae TaxID=90964 RepID=UPI001F5439CD
+#=GS UniRef90_UPI000DA9B884/1-51 DE [subseq from] plasmid segregation protein ParR n=3 Tax=Bacillales TaxID=1385 RepID=UPI000DA9B884
+#=GS UniRef90_A0A0Q9XW80/1-51    DE [subseq from] RHH_1 domain-containing protein n=1 Tax=Staphylococcus sp. NAM3COL9 TaxID=1667172 RepID=A0A0Q9XW80_9STAP
+#=GS UniRef90_UPI001CCC4088/3-48 DE [subseq from] plasmid segregation protein ParR n=1 Tax=Macrococcus armenti TaxID=2875764 RepID=UPI001CCC4088
+#=GS UniRef90_UPI0014612D4C/1-49 DE [subseq from] De novo designed WSHC6 n=2 Tax=synthetic construct TaxID=32630 RepID=UPI0014612D4C
+#=GS UniRef90_UPI000B802FE5/1-42 DE [subseq from] HEEH_rd4_0097 n=1 Tax=Escherichia coli TaxID=562 RepID=UPI000B802FE5
+#=GS UniRef90_UPI001E281CEB/1-54 DE [subseq from] Network hallucinated protein 0738_mod n=1 Tax=synthetic construct TaxID=32630 RepID=UPI001E281CEB
+
+query                               MGSSHHHHHHSSGLVP-GSHMDKKE-TKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+UniRef90_A0A141BHY3/1-51            --------------------MDKKE-TKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEIGENP
+UniRef90_UPI000A061283/1-51         --------------------MDKKE-TKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEMGENP
+UniRef90_UPI001E649B27/1-51         --------------------MDKKE-TKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEMGDNP
+UniRef90_UPI00201A2D50/1-51         --------------------MEKKE-TKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALRRYIEEMGDNP
+UniRef90_UPI0018EDBA69/1-51         --------------------MDKKE-TKHLLKIKKEDYPQIFDFLENVPRGTKTAHIREALLRYIEEFGENP
+UniRef90_UPI0005E12F5A/1-51         --------------------MKKKE-TQHLLKIKKEDYPQIFDFLEGLPRGTKTAHIREALLRYIADEGENP
+UniRef90_UPI00207B21F3/1-51         --------------------MSKQE-TNHLLKIKKEDYPQIFDFLENVPKGTKTAHIREALIRYINDLGGSP
+UniRef90_UPI0009836679/1-51         --------------------MDKKE-TQHLLKIKKQDYPQIFNFLEGLPKGTKTAHIREALMRYIAEEGQNP
+UniRef90_UPI001F5439CD/1-51         --------------------MSKQE-TNHLLKIKKKDYPQIFDFLENVPKGTKTAHIREALIRYINDLGGTP
+UniRef90_UPI000DA9B884/1-51         --------------------MDKKE-TQHLLKIKKQDYPQIFNFLEGLPKGTKTAHIREALMRYIAEEGNTP
+UniRef90_A0A0Q9XW80/1-51            --------------------MSKQE-TNHLLKIKKKDYPQIFEFLEGVPKGTKTAHIREALLRYIEELGAPP
+UniRef90_UPI001CCC4088/3-48         ----------------------KEV-NQTLLKIDKAEYPEIYDFLENVPRGTKTAHIREALIRYINDIN---
+UniRef90_UPI0014612D4C/1-49         MGSSHHHHHHSSGLVPRGSHMTEDE-IRKLRKLLEEAEKKLYKLEDKTRR----------------------
+UniRef90_UPI000B802FE5/1-42         MGSSHHHHHHSSGLVPRGSHMDVEEQIRRLEEVLKKNQPVTW------------------------------
+UniRef90_UPI001E281CEB/1-54         MGSSHHHHHHSSGLVPRGSHMNIQV-SLQWE---DPKKGKVFSHTVNIPPGGTAEQIA--------------
+#=GC RF                             xxxxxxxxxxxxxxxx.xxxxxxxx.xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+//