Commit 13f8f163 authored by zhuwenwen (parents a509a4c5, b5fa2ba3)
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
def drmsd(structure_1, structure_2, mask=None):
    def prep_d(structure):
        # Pairwise Euclidean distance matrix over the point dimension
        d = structure[..., :, None, :] - structure[..., None, :, :]
        d = d ** 2
        d = torch.sqrt(torch.sum(d, dim=-1))
        return d

    d1 = prep_d(structure_1)
    d2 = prep_d(structure_2)

    # dRMSD: root-mean-square difference between the two distance matrices
    drmsd = d1 - d2
    drmsd = drmsd ** 2
    if(mask is not None):
        drmsd = drmsd * (mask[..., None] * mask[..., None, :])
    drmsd = torch.sum(drmsd, dim=(-1, -2))
    n = d1.shape[-1] if mask is None else torch.sum(mask, dim=-1)
    drmsd = drmsd * (1 / (n * (n - 1))) if n > 1 else (drmsd * 0.)
    drmsd = torch.sqrt(drmsd)

    return drmsd


def drmsd_np(structure_1, structure_2, mask=None):
    structure_1 = torch.tensor(structure_1)
    structure_2 = torch.tensor(structure_2)
    if(mask is not None):
        mask = torch.tensor(mask)

    return drmsd(structure_1, structure_2, mask)


def gdt(p1, p2, mask, cutoffs):
    # Fraction of masked positions within each distance cutoff, averaged
    # over the cutoffs
    n = torch.sum(mask, dim=-1)

    p1 = p1.float()
    p2 = p2.float()
    distances = torch.sqrt(torch.sum((p1 - p2)**2, dim=-1))
    scores = []
    for c in cutoffs:
        score = torch.sum((distances <= c) * mask, dim=-1) / n
        score = torch.mean(score)
        scores.append(score)

    return sum(scores) / len(scores)


def gdt_ts(p1, p2, mask):
    return gdt(p1, p2, mask, [1., 2., 4., 8.])


def gdt_ha(p1, p2, mask):
    return gdt(p1, p2, mask, [0.5, 1., 2., 4.])
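
As a quick sanity check, the metrics above can be exercised on synthetic coordinates; the following is a minimal sketch (shapes and noise levels are illustrative, not part of the repository):

# Minimal smoke test for drmsd / gdt_ts / gdt_ha (illustrative only).
import torch

N = 64  # number of residues
reference = torch.randn(N, 3) * 10.0
perturbed = reference + torch.randn(N, 3) * 0.5  # mildly noised copy
mask = torch.ones(N)

# drmsd compares intra-structure distance matrices, so it is invariant to
# global rotations and translations of either structure
print(drmsd(perturbed, reference, mask=mask))

# GDT scores are fractions in [0, 1]; higher is better
print(gdt_ts(perturbed, reference, mask))
print(gdt_ha(perturbed, reference, mask))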
# Copyright 2021 AlQuraishi Laboratory
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import math
import numpy as np
import os
from openfold.utils.script_utils import load_models_from_command_line, parse_fasta, run_model, prep_output, \
    update_timings, relax_protein
logging.basicConfig()
logger = logging.getLogger(__file__)
logger.setLevel(level=logging.INFO)
import pickle
import random
import time
import torch
torch_versions = torch.__version__.split(".")
torch_major_version = int(torch_versions[0])
torch_minor_version = int(torch_versions[1])
if(
    torch_major_version > 1 or
    (torch_major_version == 1 and torch_minor_version >= 12)
):
    # Gives a large speedup on Ampere-class GPUs
    torch.set_float32_matmul_precision("high")

torch.set_grad_enabled(False)
from openfold.config import model_config
from openfold.data import templates, feature_pipeline, data_pipeline
from openfold.np import residue_constants, protein
import openfold.np.relax.relax as relax
from openfold.utils.tensor_utils import (
    tensor_tree_map,
)
from openfold.utils.trace_utils import (
    pad_feature_dict_seq,
    trace_model_,
)
from scripts.utils import add_data_args
TRACING_INTERVAL = 50
def precompute_alignments(tags, seqs, alignment_dir, args):
    for tag, seq in zip(tags, seqs):
        tmp_fasta_path = os.path.join(args.output_dir, f"tmp_{os.getpid()}.fasta")
        with open(tmp_fasta_path, "w") as fp:
            fp.write(f">{tag}\n{seq}")

        local_alignment_dir = os.path.join(alignment_dir, tag)

        if(args.use_precomputed_alignments is None and not os.path.isdir(local_alignment_dir)):
            logger.info(f"Generating alignments for {tag}...")

            os.makedirs(local_alignment_dir)

            alignment_runner = data_pipeline.AlignmentRunner(
                jackhmmer_binary_path=args.jackhmmer_binary_path,
                hhblits_binary_path=args.hhblits_binary_path,
                hhsearch_binary_path=args.hhsearch_binary_path,
                uniref90_database_path=args.uniref90_database_path,
                mgnify_database_path=args.mgnify_database_path,
                bfd_database_path=args.bfd_database_path,
                uniclust30_database_path=args.uniclust30_database_path,
                pdb70_database_path=args.pdb70_database_path,
                no_cpus=args.cpus,
            )
            alignment_runner.run(
                tmp_fasta_path, local_alignment_dir
            )
        else:
            logger.info(
                f"Using precomputed alignments for {tag} at {alignment_dir}..."
            )

        # Remove temporary FASTA file
        os.remove(tmp_fasta_path)


def round_up_seqlen(seqlen):
    return int(math.ceil(seqlen / TRACING_INTERVAL)) * TRACING_INTERVAL
def generate_feature_dict(
    tags,
    seqs,
    alignment_dir,
    data_processor,
    args,
):
    tmp_fasta_path = os.path.join(args.output_dir, f"tmp_{os.getpid()}.fasta")
    if len(seqs) == 1:
        tag = tags[0]
        seq = seqs[0]
        with open(tmp_fasta_path, "w") as fp:
            fp.write(f">{tag}\n{seq}")

        local_alignment_dir = os.path.join(alignment_dir, tag)
        feature_dict = data_processor.process_fasta(
            fasta_path=tmp_fasta_path, alignment_dir=local_alignment_dir
        )
    else:
        with open(tmp_fasta_path, "w") as fp:
            fp.write(
                '\n'.join([f">{tag}\n{seq}" for tag, seq in zip(tags, seqs)])
            )
        feature_dict = data_processor.process_multiseq_fasta(
            fasta_path=tmp_fasta_path, super_alignment_dir=alignment_dir,
        )

    # Remove temporary FASTA file
    os.remove(tmp_fasta_path)

    return feature_dict


def list_files_with_extensions(dir, extensions):
    return [f for f in os.listdir(dir) if f.endswith(extensions)]
def main(args):
    # Create the output directory
    os.makedirs(args.output_dir, exist_ok=True)

    config = model_config(args.config_preset, long_sequence_inference=args.long_sequence_inference)

    if(args.trace_model):
        if(not config.data.predict.fixed_size):
            raise ValueError(
                "Tracing requires that fixed_size mode be enabled in the config"
            )

    template_featurizer = templates.TemplateHitFeaturizer(
        mmcif_dir=args.template_mmcif_dir,
        max_template_date=args.max_template_date,
        max_hits=config.data.predict.max_templates,
        kalign_binary_path=args.kalign_binary_path,
        release_dates_path=args.release_dates_path,
        obsolete_pdbs_path=args.obsolete_pdbs_path
    )

    data_processor = data_pipeline.DataPipeline(
        template_featurizer=template_featurizer,
    )

    output_dir_base = args.output_dir
    random_seed = args.data_random_seed
    if random_seed is None:
        random_seed = random.randrange(2**32)

    np.random.seed(random_seed)
    torch.manual_seed(random_seed + 1)

    feature_processor = feature_pipeline.FeaturePipeline(config.data)
    if not os.path.exists(output_dir_base):
        os.makedirs(output_dir_base)
    if args.use_precomputed_alignments is None:
        alignment_dir = os.path.join(output_dir_base, "alignments")
    else:
        alignment_dir = args.use_precomputed_alignments

    tag_list = []
    seq_list = []
    for fasta_file in list_files_with_extensions(args.fasta_dir, (".fasta", ".fa")):
        # Gather input sequences
        with open(os.path.join(args.fasta_dir, fasta_file), "r") as fp:
            data = fp.read()

        tags, seqs = parse_fasta(data)
        # assert len(tags) == len(set(tags)), "All FASTA tags must be unique"
        tag = '-'.join(tags)

        tag_list.append((tag, tags))
        seq_list.append(seqs)

    seq_sort_fn = lambda target: sum([len(s) for s in target[1]])
    sorted_targets = sorted(zip(tag_list, seq_list), key=seq_sort_fn)
    feature_dicts = {}
    model_generator = load_models_from_command_line(
        config,
        args.model_device,
        args.openfold_checkpoint_path,
        args.jax_param_path,
        args.output_dir)
    for model, output_directory in model_generator:
        cur_tracing_interval = 0
        for (tag, tags), seqs in sorted_targets:
            output_name = f'{tag}_{args.config_preset}'
            if args.output_postfix is not None:
                output_name = f'{output_name}_{args.output_postfix}'

            # Does nothing if the alignments have already been computed
            precompute_alignments(tags, seqs, alignment_dir, args)

            feature_dict = feature_dicts.get(tag, None)
            if(feature_dict is None):
                feature_dict = generate_feature_dict(
                    tags,
                    seqs,
                    alignment_dir,
                    data_processor,
                    args,
                )

                if(args.trace_model):
                    n = feature_dict["aatype"].shape[-2]
                    rounded_seqlen = round_up_seqlen(n)
                    feature_dict = pad_feature_dict_seq(
                        feature_dict, rounded_seqlen,
                    )

                feature_dicts[tag] = feature_dict

            processed_feature_dict = feature_processor.process_features(
                feature_dict, mode='predict',
            )

            processed_feature_dict = {
                k:torch.as_tensor(v, device=args.model_device)
                for k,v in processed_feature_dict.items()
            }

            if(args.trace_model):
                if(rounded_seqlen > cur_tracing_interval):
                    logger.info(
                        f"Tracing model at {rounded_seqlen} residues..."
                    )
                    t = time.perf_counter()
                    trace_model_(model, processed_feature_dict)
                    tracing_time = time.perf_counter() - t
                    logger.info(
                        f"Tracing time: {tracing_time}"
                    )
                    cur_tracing_interval = rounded_seqlen

            out = run_model(model, processed_feature_dict, tag, args.output_dir)

            # Toss out the recycling dimensions --- we don't need them anymore
            processed_feature_dict = tensor_tree_map(
                lambda x: np.array(x[..., -1].cpu()),
                processed_feature_dict
            )
            out = tensor_tree_map(lambda x: np.array(x.cpu()), out)

            unrelaxed_protein = prep_output(
                out,
                processed_feature_dict,
                feature_dict,
                feature_processor,
                args.config_preset,
                args.multimer_ri_gap,
                args.subtract_plddt
            )

            unrelaxed_file_suffix = "_unrelaxed.pdb"
            if args.cif_output:
                unrelaxed_file_suffix = "_unrelaxed.cif"
            unrelaxed_output_path = os.path.join(
                output_directory, f'{output_name}{unrelaxed_file_suffix}'
            )

            with open(unrelaxed_output_path, 'w') as fp:
                if args.cif_output:
                    fp.write(protein.to_modelcif(unrelaxed_protein))
                else:
                    fp.write(protein.to_pdb(unrelaxed_protein))

            logger.info(f"Output written to {unrelaxed_output_path}...")

            if not args.skip_relaxation:
                # Relax the prediction.
                logger.info(f"Running relaxation on {unrelaxed_output_path}...")
                relax_protein(config, args.model_device, unrelaxed_protein, output_directory, output_name, args.cif_output)

            if args.save_outputs:
                output_dict_path = os.path.join(
                    output_directory, f'{output_name}_output_dict.pkl'
                )
                with open(output_dict_path, "wb") as fp:
                    pickle.dump(out, fp, protocol=pickle.HIGHEST_PROTOCOL)

                logger.info(f"Model output written to {output_dict_path}...")
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "fasta_dir", type=str,
        help="Path to directory containing FASTA files, one sequence per file"
    )
    parser.add_argument(
        "template_mmcif_dir", type=str,
    )
    parser.add_argument(
        "--use_precomputed_alignments", type=str, default=None,
        help="""Path to alignment directory. If provided, alignment computation
             is skipped and database path arguments are ignored."""
    )
    parser.add_argument(
        "--output_dir", type=str, default=os.getcwd(),
        help="""Name of the directory in which to output the prediction""",
    )
    parser.add_argument(
        "--model_device", type=str, default="cpu",
        help="""Name of the device on which to run the model. Any valid torch
             device name is accepted (e.g. "cpu", "cuda:0")"""
    )
    parser.add_argument(
        "--config_preset", type=str, default="model_1",
        help="""Name of a model config preset defined in openfold/config.py"""
    )
    parser.add_argument(
        "--jax_param_path", type=str, default=None,
        help="""Path to JAX model parameters. If None, and openfold_checkpoint_path
             is also None, parameters are selected automatically according to
             the model name from openfold/resources/params"""
    )
    parser.add_argument(
        "--openfold_checkpoint_path", type=str, default=None,
        help="""Path to OpenFold checkpoint. Can be either a DeepSpeed
             checkpoint directory or a .pt file"""
    )
    parser.add_argument(
        "--save_outputs", action="store_true", default=False,
        help="Whether to save all model outputs, including embeddings, etc."
    )
    parser.add_argument(
        "--cpus", type=int, default=4,
        help="""Number of CPUs with which to run alignment tools"""
    )
    parser.add_argument(
        "--preset", type=str, default='full_dbs',
        choices=('reduced_dbs', 'full_dbs')
    )
    parser.add_argument(
        "--output_postfix", type=str, default=None,
        help="""Postfix for output prediction filenames"""
    )
    parser.add_argument(
        "--data_random_seed", type=int, default=None
    )
    parser.add_argument(
        "--skip_relaxation", action="store_true", default=False,
    )
    parser.add_argument(
        "--multimer_ri_gap", type=int, default=200,
        help="""Residue index offset between multiple sequences, if provided"""
    )
    parser.add_argument(
        "--trace_model", action="store_true", default=False,
        help="""Whether to convert parts of each model to TorchScript.
             Significantly improves runtime at the cost of lengthy
             'compilation.' Useful for large batch jobs."""
    )
    parser.add_argument(
        "--subtract_plddt", action="store_true", default=False,
        help="""Whether to output (100 - pLDDT) in the B-factor column instead
             of the pLDDT itself"""
    )
    parser.add_argument(
        "--long_sequence_inference", action="store_true", default=False,
        help="""Enable options that reduce memory usage at the cost of speed,
             helping longer sequences fit into GPU memory. See the README for
             details"""
    )
    parser.add_argument(
        "--cif_output", action="store_true", default=False,
        help="Output predicted models in ModelCIF format instead of PDB format (default)"
    )
    add_data_args(parser)
    args = parser.parse_args()

    if(args.jax_param_path is None and args.openfold_checkpoint_path is None):
        args.jax_param_path = os.path.join(
            "openfold", "resources", "params",
            "params_" + args.config_preset + ".npz"
        )

    if(args.model_device == "cpu" and torch.cuda.is_available()):
        logging.warning(
            """The model is being run on CPU. Consider specifying
            --model_device for better performance"""
        )

    main(args)
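
For reference, a hypothetical invocation of the inference script above might look like the following; the script name and all paths are placeholders, while the flags mirror the argument parser it defines:

# Hypothetical command line for the inference script above (illustrative).
import subprocess

subprocess.run([
    "python", "run_pretrained_openfold.py",  # assumed script name
    "target_fastas/",                        # fasta_dir (positional)
    "data/pdb_mmcif/mmcif_files/",           # template_mmcif_dir (positional)
    "--output_dir", "predictions/",
    "--config_preset", "model_1_ptm",
    "--model_device", "cuda:0",
    "--save_outputs",
], check=True)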
#!/bin/bash
source scripts/vars.sh
source lib/conda/etc/profile.d/conda.sh
conda activate $ENV_NAME
import argparse
import json
import os
def main(args):
    db_path = os.path.join(args.output_db_path, f"{args.output_db_name}.db")
    index_path = os.path.join(
        args.output_db_path, f"{args.output_db_name}.index"
    )

    db_fp = open(db_path, "wb")
    index = {}
    db_offset = 0
    for chain_alignment_dir in os.listdir(args.alignment_dir):
        cad_path = os.path.join(args.alignment_dir, chain_alignment_dir)
        for f in os.listdir(cad_path):
            f_path = os.path.join(cad_path, f)
            with open(f_path, "rb") as fp:
                file_bytes = fp.read()

            l = len(file_bytes)
            file_list = index.setdefault(chain_alignment_dir, [])
            file_list.append((f, db_offset, l))
            db_fp.write(file_bytes)
            db_offset += l

    db_fp.close()

    with open(index_path, "w") as fp:
        json.dump(index, fp)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "alignment_dir", type=str,
        help="""Path to precomputed alignment directory, with one subdirectory
             per chain."""
    )
    parser.add_argument("output_db_path", type=str)
    parser.add_argument("output_db_name", type=str)
    args = parser.parse_args()

    main(args)
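
A record can be recovered from the resulting .db/.index pair by seeking to its stored offset; a minimal reader sketch (not part of the repository):

# Hypothetical reader for the flat-file database written above.
import json

def read_alignment_file(db_path, index_path, chain, filename):
    with open(index_path, "r") as fp:
        index = json.load(fp)

    # Each index entry is a list of [filename, byte_offset, length] records
    for name, offset, length in index[chain]:
        if name == filename:
            with open(db_path, "rb") as db_fp:
                db_fp.seek(offset)
                return db_fp.read(length)

    raise KeyError(f"{filename} not found for chain {chain}")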
import argparse
import json
import os
""" Unifies databases created with create_alignment_db.py """
def main(args):
    super_index = {}
    for f in os.listdir(args.alignment_db_dir):
        if(not os.path.splitext(f)[-1] == ".index"):
            continue

        with open(os.path.join(args.alignment_db_dir, f), "r") as fp:
            index = json.load(fp)

        db_name = f"{os.path.splitext(f)[0]}.db"
        for k in index:
            super_index[k] = {
                "db": db_name,
                "files": index[k],
            }

    with open(os.path.join(args.output_dir, "super.index"), "w") as fp:
        json.dump(super_index, fp)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("alignment_db_dir", type=str, help="Path to directory containing alignment_dbs")
    parser.add_argument("output_dir", type=str, help="Path in which to output super index")
    args = parser.parse_args()

    main(args)
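
The resulting super.index maps each chain to the database shard holding its files, so a lookup goes through two levels; a hypothetical sketch (the chain key and directory name are placeholders):

# Hypothetical two-level lookup via super.index.
import json
import os

with open("super.index", "r") as fp:
    super_index = json.load(fp)

entry = super_index["some_chain_id"]          # assumed chain key
db_path = os.path.join("alignment_dbs", entry["db"])
filename, offset, length = entry["files"][0]  # [filename, offset, length]
with open(db_path, "rb") as db_fp:
    db_fp.seek(offset)
    file_bytes = db_fp.read(length)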
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
parser = argparse.ArgumentParser(description='''Outputs a DeepSpeed
                                                configuration file to
                                                stdout''')
parser.add_argument("--gradient_clipping", type=float, default=None,
                    help="Value of gradient clipping")
p = parser.add_argument_group("Optimizer")
p.add_argument("--optimizer", default=None,
               help='''Choice of optimizer. Choose between "Adam" or
                       "OneBitAdam"''')
p.add_argument("--lr", dest="lr", type=float, default=1e-3,
               help="The learning rate")
p.add_argument("--freeze_step", type=int, default=100,
               help='''Number of warm-up steps before 1-bit compression
                       activates. Applies only when --optimizer is
                       OneBitAdam''')
p.add_argument("--cuda_aware", action="store_true", default=False,
               help='''Indicates that the underlying MPI library supports
                       CUDA-Aware communication. Applies only when
                       --optimizer is OneBitAdam''')
p.add_argument("--comm_backend_name", type=str, default="nccl",
               help='''Communication implementation for OneBitAdam. Choose
                       from nccl and mpi''')
p.add_argument("--eps", type=float, default=1e-8,
               help="Adam epsilon parameter")
sched = parser.add_argument_group("Scheduler")
sched.add_argument(
    "--scheduler", type=str, default=None,
    help='''The LR scheduler. Choose from "LRRangeTest", "OneCycle", "WarmupLR",
            and "WarmupDecayLR". Documentation for each can be found here:
            deepspeed.readthedocs.io/en/latest/schedulers.html'''
)
range_test = sched.add_argument_group("LRRangeTest")
range_test.add_argument(
    "--lr_range_test_min_lr", type=float, default=1e-04
)
range_test.add_argument(
    "--lr_range_test_step_size", type=int, default=2000
)
range_test.add_argument(
    "--lr_range_test_step_rate", type=float, default=1.0
)
range_test.add_argument(
    "--lr_range_test_staircase", type=bool, default=False
)
cycle = sched.add_argument_group("OneCycle")
cycle.add_argument(
    "--cycle_min_lr", type=float, default=1e-06
)
cycle.add_argument(
    "--cycle_max_lr", type=float, default=1e-03
)
cycle.add_argument(
    "--cycle_decay_lr_rate", type=float, default=0
)
cycle.add_argument(
    "--cycle_first_step_size", type=int, default=2000
)
cycle.add_argument(
    "--cycle_second_step_size", type=int, default=None
)
cycle.add_argument(
    "--cycle_first_stair_count", type=int, default=0
)
cycle.add_argument(
    "--cycle_second_stair_count", type=int, default=0
)
cycle.add_argument(
    "--cycle_decay_step_size", type=int, default=0
)
cycle.add_argument(
    "--cycle_momentum", type=bool, default=True
)
cycle.add_argument(
    "--cycle_min_mom", type=float, default=0.8
)
cycle.add_argument(
    "--cycle_max_mom", type=float, default=0.9
)
cycle.add_argument(
    "--cycle_decay_mom_rate", type=float, default=0
)
warmup = sched.add_argument_group("WarmupLR")
warmup.add_argument(
    "--warmup_min_lr", type=float, default=0.
)
warmup.add_argument(
    "--warmup_max_lr", type=float, default=0.001
)
warmup.add_argument(
    "--warmup_num_steps", type=int, default=1000
)
warmup_decay = sched.add_argument_group("WarmupDecayLR")
warmup_decay.add_argument(
    "--warmup_decay_total_num_steps", type=int, default=100000
)
warmup_decay.add_argument(
    "--warmup_decay_min_lr", type=float, default=0.
)
warmup_decay.add_argument(
    "--warmup_decay_max_lr", type=float, default=0.001
)
warmup_decay.add_argument(
    "--warmup_decay_num_steps", type=int, default=1000
)
p = parser.add_argument_group("Half-precision training (fp16)")
p.add_argument("--fp16", dest="fp16", action="store_true", default=False,
               help="""Whether to train in 16-bit/mixed-precision mode.
                       Mutually exclusive with --amp""")
p = parser.add_argument_group("Half-precision training (bfloat16)")
p.add_argument("--bfloat16", dest="bfloat16", action="store_true",
               default=False,
               help="""Whether to train in 16-bit bfloat16 mode. Mutually
                       exclusive with --amp and --fp16. Requires hardware
                       support""")
p = parser.add_argument_group("AMP")
p.add_argument("--amp", action="store_true", default=False,
               help="""Whether to enable AMP training. Mutually exclusive with
                       --fp16""")
p.add_argument("--opt_level", type=str, default="O1",
               help="""AMP optimization level. One of "O0", "O1", "O2", or
                       "O3".""")
p = parser.add_argument_group("Activation checkpointing")
p.add_argument("--partition_activations", action="store_true",
default=False,
help="Activation checkpointing")
p.add_argument("--cpu_checkpointing", action="store_true", default=False,
help="Offload activation checkpoints to CPU")
p.add_argument("--profile", action="store_true",
default=False,
help="Whether to profile activation checkpointing")
p = parser.add_argument_group("ZeRO optimization")
p.add_argument("--zero_stage", type=int, default=2,
help="ZeRO optimizer stage")
p.add_argument("--allgather_partitions", action="store_true",
default=False,
help='''Allgather collective vs. broadcast collectives
for parameter gathering''')
p.add_argument("--allgather_bucket_size", type=int, default=1e9,
help="Number of elements allgathered at one time")
p.add_argument("--overlap_comm", action="store_true", default=False,
help='''Whether to overlap gradient reduction and backward
pass''')
p.add_argument("--reduce_scatter", action="store_true", default=False,
help="Use reduce to average gradients")
p.add_argument("--reduce_bucket_size", type=int, default=1e9,
help="Number of elements reduced at one time")
p.add_argument("--offload_optimizer", action="store_true", default=False,
help='''Offload optimizer state to CPU. Valid only when
--stage is 2 or 3''')
p.add_argument("--pin_memory", action="store_true", default=False,
help="Speeds up offloaded throughput at the cost of memory")
p = parser.add_argument_group("Flops profiler")
p.add_argument("--flops_profiler", action="store_true", default=False,
help="Whether to enable the DeepSpeed Flops Profiler")
p.add_argument("--profile_step", type=int, default=1,
help='''The global training step at which to run the flops
profiler. Has no effect unless --flops_profiler is
given''')
p.add_argument("--module_depth", type=int, default=-1,
help='''Depth to which aggregated module info is printed. Has
no effect unless --flops_profiler is given''')
p.add_argument("--top_modules", type=int, default=3,
help='''Number of top modules to print in the aggregated
profile. Has no effect unless --flops_profiler is
given''')
p.add_argument("--detailed_flops_profile", action="store_true",
default=False,
help='''Whether the flops_profiler should be detailed. Has
no effect unless --flops_profiler is given''')
args = parser.parse_args()

d = {}

# Optimizer settings
if(args.optimizer is not None):
    optimizer = {}
    optimizer["type"] = args.optimizer

    params = {}
    params["lr"] = args.lr
    params["eps"] = args.eps
    if(args.optimizer == "OneBitAdam"):
        params["freeze_step"] = args.freeze_step
        params["cuda_aware"] = args.cuda_aware
        params["comm_backend_name"] = args.comm_backend_name

    optimizer["params"] = params
    d["optimizer"] = optimizer

# LR scheduler
if(args.scheduler is not None):
    scheduler = {}
    scheduler["type"] = args.scheduler

    params = {}
    if(args.scheduler == "LRRangeTest"):
        params["lr_range_test_min_lr"] = args.lr_range_test_min_lr
        params["lr_range_test_step_size"] = args.lr_range_test_step_size
        params["lr_range_test_step_rate"] = args.lr_range_test_step_rate
        params["lr_range_test_staircase"] = args.lr_range_test_staircase
    elif(args.scheduler == "OneCycle"):
        params["cycle_min_lr"] = args.cycle_min_lr
        params["cycle_max_lr"] = args.cycle_max_lr
        params["decay_lr_rate"] = args.cycle_decay_lr_rate
        params["cycle_first_step_size"] = args.cycle_first_step_size
        params["cycle_second_step_size"] = args.cycle_second_step_size
        params["cycle_first_stair_count"] = args.cycle_first_stair_count
        params["cycle_second_stair_count"] = args.cycle_second_stair_count
        params["cycle_momentum"] = args.cycle_momentum
        params["cycle_min_mom"] = args.cycle_min_mom
        params["cycle_max_mom"] = args.cycle_max_mom
        params["decay_mom_rate"] = args.cycle_decay_mom_rate
    elif(args.scheduler == "WarmupLR"):
        params["warmup_min_lr"] = args.warmup_min_lr
        params["warmup_max_lr"] = args.warmup_max_lr
        params["warmup_num_steps"] = args.warmup_num_steps
    elif(args.scheduler == "WarmupDecayLR"):
        params["warmup_min_lr"] = args.warmup_decay_min_lr
        params["warmup_max_lr"] = args.warmup_decay_max_lr
        params["warmup_num_steps"] = args.warmup_decay_num_steps
        params["total_num_steps"] = args.warmup_decay_total_num_steps
    else:
        raise ValueError("Invalid scheduler")

    scheduler["params"] = params
    d["scheduler"] = scheduler

# 16-bit training
if(sum([args.amp, args.fp16, args.bfloat16]) > 1):
    raise ValueError("Only one of --fp16, --amp, or --bfloat16 can be enabled")

if(args.amp):
    amp = {}
    amp["enabled"] = True
    amp["opt_level"] = args.opt_level
    d["amp"] = amp
elif(args.fp16):
    fp16 = {}
    fp16["enabled"] = args.fp16
    d["fp16"] = fp16
elif(args.bfloat16):
    bfloat16 = {}
    bfloat16["enabled"] = args.bfloat16
    d["bfloat16"] = bfloat16

# Activation checkpointing
ac = {}
ac["partition_activations"] = args.partition_activations
ac["cpu_checkpointing"] = args.cpu_checkpointing
ac["profile"] = args.profile
d["activation_checkpointing"] = ac

# ZeRO optimization
zo = {}
zo["stage"] = args.zero_stage
zo["allgather_partitions"] = args.allgather_partitions
zo["allgather_bucket_size"] = args.allgather_bucket_size
zo["reduce_bucket_size"] = args.reduce_bucket_size
zo["overlap_comm"] = args.overlap_comm
zo["reduce_scatter"] = args.reduce_scatter
if(args.offload_optimizer):
    oo = {}
    oo["device"] = "cpu"
    oo["pin_memory"] = args.pin_memory
    zo["offload_optimizer"] = oo

d["zero_optimization"] = zo

# Flops Profiler
flops_profiler = {}
flops_profiler["enabled"] = args.flops_profiler
flops_profiler["profile_step"] = args.profile_step
flops_profiler["module_depth"] = args.module_depth
flops_profiler["top_modules"] = args.top_modules
flops_profiler["detailed"] = args.detailed_flops_profile
d["flops_profiler"] = flops_profiler

if(args.gradient_clipping):
    d["gradient_clipping"] = args.gradient_clipping

print(json.dumps(d, indent=2))
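
The script prints a standard DeepSpeed JSON config to stdout. Below is a sketch of the dictionary it would emit for a run with only --fp16 and --zero_stage 2 set (values follow the defaults above; the invocation and output are illustrative):

# Sketch of the config emitted for "--fp16 --zero_stage 2" with all other
# flags left at their defaults (values shown mirror what the code above builds).
expected = {
    "fp16": {"enabled": True},
    "activation_checkpointing": {
        "partition_activations": False,
        "cpu_checkpointing": False,
        "profile": False,
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": False,
        "allgather_bucket_size": 1000000000,
        "reduce_bucket_size": 1000000000,
        "overlap_comm": False,
        "reduce_scatter": False,
    },
    "flops_profiler": {
        "enabled": False,
        "profile_step": 1,
        "module_depth": -1,
        "top_modules": 3,
        "detailed": False,
    },
}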
#!/bin/bash -e
# Copied from colabfold.mmseqs.com
MMSEQS="$1"
QUERY="$2"
DBBASE="$3"
BASE="$4"
DB1="$5"
DB2="$6"
DB3="$7"
USE_ENV="${8:-1}"
USE_TEMPLATES="${9:-0}"
FILTER="${10:-1}"
INDEX=${11:-1}
DB_LOAD_MODE="${12:-2}"
EXPAND_EVAL=inf
ALIGN_EVAL=10
DIFF=3000
QSC=-20.0
MAX_ACCEPT=1000000
if [ "${FILTER}" = "1" ]; then
# 0.1 was not used in benchmarks due to POSIX shell bug in line above
# EXPAND_EVAL=0.1
ALIGN_EVAL=10
QSC=0.8
MAX_ACCEPT=100000
fi
if [ "${INDEX}" = "1" ]; then
SEQ=".idx"
ALN=".idx"
IDX=".idx"
else
SEQ="_seq"
ALN="_aln"
IDX=""
export MMSEQS_IGNORE_INDEX=1
fi
export MMSEQS_CALL_DEPTH=1
SEARCH_PARAM="--num-iterations 3 --db-load-mode ${DB_LOAD_MODE} -a -s 8 -e 0.1 --max-seqs 10000"
FILTER_PARAM="--filter-msa ${FILTER} --filter-min-enable 1000 --diff ${DIFF} --qid 0.0,0.2,0.4,0.6,0.8,1.0 --qsc 0 --max-seq-id 0.95"
EXPAND_PARAM="--expansion-mode 0 -e ${EXPAND_EVAL} --expand-filter-clusters ${FILTER} --max-seq-id 0.95"
mkdir -p "${BASE}"
"${MMSEQS}" createdb "${QUERY}" "${BASE}/qdb"
"${MMSEQS}" search "${BASE}/qdb" "${DBBASE}/${DB1}" "${BASE}/res" "${BASE}/tmp" $SEARCH_PARAM
"${MMSEQS}" expandaln "${BASE}/qdb" "${DBBASE}/${DB1}${SEQ}" "${BASE}/res" "${DBBASE}/${DB1}${ALN}" "${BASE}/res_exp" --db-load-mode ${DB_LOAD_MODE} ${EXPAND_PARAM}
"${MMSEQS}" mvdb "${BASE}/tmp/latest/profile_1" "${BASE}/prof_res"
"${MMSEQS}" lndb "${BASE}/qdb_h" "${BASE}/prof_res_h"
"${MMSEQS}" align "${BASE}/prof_res" "${DBBASE}/${DB1}${SEQ}" "${BASE}/res_exp" "${BASE}/res_exp_realign" --db-load-mode ${DB_LOAD_MODE} -e ${ALIGN_EVAL} --max-accept ${MAX_ACCEPT} --alt-ali 10 -a
"${MMSEQS}" filterresult "${BASE}/qdb" "${DBBASE}/${DB1}${SEQ}" "${BASE}/res_exp_realign" "${BASE}/res_exp_realign_filter" --db-load-mode ${DB_LOAD_MODE} --qid 0 --qsc $QSC --diff 0 --max-seq-id 1.0 --filter-min-enable 100
"${MMSEQS}" result2msa "${BASE}/qdb" "${DBBASE}/${DB1}${SEQ}" "${BASE}/res_exp_realign_filter" "${BASE}/uniref.a3m" --msa-format-mode 6 --db-load-mode ${DB_LOAD_MODE} ${FILTER_PARAM}
"${MMSEQS}" rmdb "${BASE}/res_exp_realign"
"${MMSEQS}" rmdb "${BASE}/res_exp"
"${MMSEQS}" rmdb "${BASE}/res"
"${MMSEQS}" rmdb "${BASE}/res_exp_realign_filter"
if [ "${USE_TEMPLATES}" = "1" ]; then
"${MMSEQS}" search "${BASE}/prof_res" "${DBBASE}/${DB2}" "${BASE}/res_pdb" "${BASE}/tmp" --db-load-mode ${DB_LOAD_MODE} -s 7.5 -a -e 0.1
"${MMSEQS}" convertalis "${BASE}/prof_res" "${DBBASE}/${DB2}${IDX}" "${BASE}/res_pdb" "${BASE}/${DB2}.m8" --format-output query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,cigar --db-load-mode ${DB_LOAD_MODE}
"${MMSEQS}" rmdb "${BASE}/res_pdb"
fi
if [ "${USE_ENV}" = "1" ]; then
"${MMSEQS}" search "${BASE}/prof_res" "${DBBASE}/${DB3}" "${BASE}/res_env" "${BASE}/tmp" $SEARCH_PARAM
"${MMSEQS}" expandaln "${BASE}/prof_res" "${DBBASE}/${DB3}${SEQ}" "${BASE}/res_env" "${DBBASE}/${DB3}${ALN}" "${BASE}/res_env_exp" -e ${EXPAND_EVAL} --expansion-mode 0 --db-load-mode ${DB_LOAD_MODE}
"${MMSEQS}" align "${BASE}/tmp/latest/profile_1" "${DBBASE}/${DB3}${SEQ}" "${BASE}/res_env_exp" "${BASE}/res_env_exp_realign" --db-load-mode ${DB_LOAD_MODE} -e ${ALIGN_EVAL} --max-accept ${MAX_ACCEPT} --alt-ali 10 -a
"${MMSEQS}" filterresult "${BASE}/qdb" "${DBBASE}/${DB3}${SEQ}" "${BASE}/res_env_exp_realign" "${BASE}/res_env_exp_realign_filter" --db-load-mode ${DB_LOAD_MODE} --qid 0 --qsc $QSC --diff 0 --max-seq-id 1.0 --filter-min-enable 100
"${MMSEQS}" result2msa "${BASE}/qdb" "${DBBASE}/${DB3}${SEQ}" "${BASE}/res_env_exp_realign_filter" "${BASE}/bfd.mgnify30.metaeuk30.smag30.a3m" --msa-format-mode 6 --db-load-mode ${DB_LOAD_MODE} ${FILTER_PARAM}
"${MMSEQS}" rmdb "${BASE}/res_env_exp_realign_filter"
"${MMSEQS}" rmdb "${BASE}/res_env_exp_realign"
"${MMSEQS}" rmdb "${BASE}/res_env_exp"
"${MMSEQS}" rmdb "${BASE}/res_env"
fi
"${MMSEQS}" rmdb "${BASE}/qdb"
"${MMSEQS}" rmdb "${BASE}/qdb_h"
"${MMSEQS}" rmdb "${BASE}/res"
rm -f -- "${BASE}/prof_res"*
rm -rf -- "${BASE}/tmp"
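
The script takes its inputs positionally: the MMseqs2 binary, the query FASTA, the database root, the output directory, then the three database names, followed by the optional flags. A hypothetical driver, with the script name and every path a placeholder:

# Hypothetical driver for the MMseqs2 alignment script above (illustrative).
import subprocess

subprocess.run([
    "bash", "precompute_alignments_mmseqs.sh",  # assumed script name
    "mmseqs",                                   # MMSEQS binary
    "queries.fasta",                            # QUERY
    "/data/mmseqs_dbs",                         # DBBASE
    "/data/msas",                               # BASE (output directory)
    "uniref30_2103_db",                         # DB1 (profile search)
    "pdb70",                                    # DB2 (templates, if enabled)
    "colabfold_envdb_202108_db",                # DB3 (environmental)
], check=True)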
# Copyright 2022 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Converts OpenFold .pt checkpoints into AlphaFold .npz ones, which can then be
# used to run inference using DeepMind's JAX code.
import argparse
import numpy as np
import torch
from openfold.config import model_config
from openfold.model.model import AlphaFold
from openfold.utils.import_weights import (
    Param,
    ParamType,
    generate_translation_dict,
    process_translation_dict,
)
from openfold.utils.tensor_utils import tree_map
def reshape_fn(of_param, af_weight):
    transformations = {
        ParamType.LinearWeight: lambda w: w.transpose(-1, -2),
        ParamType.LinearWeightMHA: lambda w: w.transpose(-1, -2).reshape(af_weight.shape),
        ParamType.LinearMHAOutputWeight: lambda w: w.transpose(-1, -2).reshape(af_weight.shape),
        ParamType.LinearBiasMHA: lambda w: w.reshape(af_weight.shape),
        ParamType.LinearWeightOPM: lambda w: w.transpose(-1, -2).reshape(af_weight.shape),
        ParamType.Other: lambda w: w,
    }

    if(of_param.stacked):
        of_weight = torch.stack([torch.Tensor(p) for p in of_param.param])
    else:
        of_weight = torch.Tensor(of_param.param)

    return transformations[of_param.param_type](of_weight)


def transfer(of_dict, af_weight_template):
    for k in of_dict:
        if(type(of_dict[k]) == dict):
            transfer(of_dict[k], af_weight_template[k])
        else:
            reshaped = reshape_fn(of_dict[k], af_weight_template[k])
            reshaped = reshaped.detach().numpy()
            np.copyto(af_weight_template[k], reshaped)


def main(args):
    d = torch.load(args.of_pt_path)
    config = model_config(args.config_preset)
    model = AlphaFold(config)
    model.load_state_dict(d)
    translation = generate_translation_dict(model, args.config_preset)
    translation = process_translation_dict(translation)
    af_weight_template = np.load(args.template_npz_path)
    af_weight_template = {k:v for k,v in af_weight_template.items() if k in translation}
    zero = lambda n: n * 0
    af_weight_template = tree_map(zero, af_weight_template, np.ndarray)
    transfer(translation, af_weight_template)
    np.savez(args.out_path, **af_weight_template)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "of_pt_path", type=str, help="Path to OpenFold .pt checkpoint file"
    )
    parser.add_argument(
        "config_preset", type=str, help="The corresponding config preset"
    )
    parser.add_argument(
        "out_path", type=str, help="Path for output .npz file"
    )
    parser.add_argument(
        "--template_npz_path",
        type=str,
        default="openfold/resources/params/params_model_1_ptm.npz",
        help="""Path to an AlphaFold checkpoint w/ a superset of the OF
             checkpoint's parameters. params_model_1_ptm.npz always works.
             """
    )
    args = parser.parse_args()

    main(args)
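
A converted checkpoint can be spot-checked by loading it back with NumPy; a minimal sketch (the path is a placeholder):

# Hypothetical sanity check for a converted .npz checkpoint.
import numpy as np

converted = np.load("out.npz")
print(len(converted.files))             # number of parameter arrays
for name in list(converted.files)[:5]:
    print(name, converted[name].shape)  # AlphaFold-style parameter names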
import argparse
import logging
import os
from openfold.data import mmcif_parsing
from openfold.np import protein, residue_constants
def main(args):
    fasta = []
    for fname in os.listdir(args.data_dir):
        basename, ext = os.path.splitext(fname)
        basename = basename.upper()
        fpath = os.path.join(args.data_dir, fname)
        if(ext == ".cif"):
            with open(fpath, 'r') as fp:
                mmcif_str = fp.read()

            mmcif = mmcif_parsing.parse(
                file_id=basename, mmcif_string=mmcif_str
            )
            if(mmcif.mmcif_object is None):
                logging.warning(f'Failed to parse {fname}...')
                if(args.raise_errors):
                    raise list(mmcif.errors.values())[0]
                else:
                    continue

            mmcif = mmcif.mmcif_object
            for chain, seq in mmcif.chain_to_seqres.items():
                chain_id = '_'.join([basename, chain])
                fasta.append(f">{chain_id}")
                fasta.append(seq)
        elif(ext == ".core"):
            with open(fpath, 'r') as fp:
                core_str = fp.read()

            core_protein = protein.from_proteinnet_string(core_str)
            aatype = core_protein.aatype
            seq = ''.join([
                residue_constants.restypes_with_x[aatype[i]]
                for i in range(len(aatype))
            ])
            fasta.append(f">{basename}")
            fasta.append(seq)

    with open(args.output_path, "w") as fp:
        fp.write('\n'.join(fasta))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "data_dir", type=str,
        help="Path to a directory containing mmCIF or .core files"
    )
    parser.add_argument(
        "output_path", type=str,
        help="Path to output FASTA file"
    )
    parser.add_argument(
        "--raise_errors", type=bool, default=False,
        help="Whether to crash on parsing errors"
    )
    args = parser.parse_args()

    main(args)
#!/bin/bash
conda deactivate
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips all required data for AlphaFold.
#
# Usage: bash download_all_data.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

if ! command -v aria2c &> /dev/null ; then
  echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
  exit 1
fi

DOWNLOAD_DIR="$1"
DOWNLOAD_MODE="${2:-full_dbs}"  # Default mode to full_dbs.
if [[ "${DOWNLOAD_MODE}" != full_dbs && "${DOWNLOAD_MODE}" != reduced_dbs ]]
then
  echo "DOWNLOAD_MODE ${DOWNLOAD_MODE} not recognized."
  exit 1
fi

SCRIPT_DIR="$(dirname "$(realpath "$0")")"

if [[ "${DOWNLOAD_MODE}" = full_dbs ]] ; then
  echo "Downloading BFD..."
  bash "${SCRIPT_DIR}/download_bfd.sh" "${DOWNLOAD_DIR}"
else
  echo "Downloading Small BFD..."
  bash "${SCRIPT_DIR}/download_small_bfd.sh" "${DOWNLOAD_DIR}"
fi
echo "Downloading MGnify..."
bash "${SCRIPT_DIR}/download_mgnify.sh" "${DOWNLOAD_DIR}"
echo "Downloading PDB70..."
bash "${SCRIPT_DIR}/download_pdb70.sh" "${DOWNLOAD_DIR}"
echo "Downloading PDB mmCIF files..."
bash "${SCRIPT_DIR}/download_pdb_mmcif.sh" "${DOWNLOAD_DIR}"
echo "Downloading Uniclust30..."
bash "${SCRIPT_DIR}/download_uniclust30.sh" "${DOWNLOAD_DIR}"
echo "Downloading Uniref90..."
bash "${SCRIPT_DIR}/download_uniref90.sh" "${DOWNLOAD_DIR}"
echo "All data downloaded."
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the AlphaFold parameters.
#
# Usage: bash download_alphafold_params.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

if ! command -v aria2c &> /dev/null ; then
  echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
  exit 1
fi

DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/params"
SOURCE_URL="https://storage.googleapis.com/alphafold/alphafold_params_2022-01-19.tar"
BASENAME=$(basename "${SOURCE_URL}")

mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
  --directory="${ROOT_DIR}" --preserve-permissions
rm "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the BFD database for AlphaFold.
#
# Usage: bash download_bfd.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

if ! command -v aria2c &> /dev/null ; then
  echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
  exit 1
fi

DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/bfd"
# Mirror of:
# https://bfd.mmseqs.com/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz.
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/casp14_versions/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")

mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
  --directory="${ROOT_DIR}"
rm "${ROOT_DIR}/${BASENAME}"
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import json
import os
import re
import requests
from openfold.data import mmcif_parsing
VALID_PERIODS = [
    "1-year",
    "6-months",
    "3-months",
    "1-month",
    "1-week",
]
def generate_url(period, end_date):
    return '/'.join([
        "https://www.cameo3d.org/",
        "modeling",
        "targets",
        period,
        "ajax",
        f"?to_date={end_date}",
    ])
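
# For illustration (an assumed invocation, not part of the original script):
#   generate_url("1-month", "2022-01-01")
# returns
#   "https://www.cameo3d.org//modeling/targets/1-month/ajax/?to_date=2022-01-01"
# (the doubled slash comes from the trailing slash in the hostname element)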
def main(args):
    data_dir_path = os.path.join(args.output_dir, "data_dir")
    fasta_dir_path = os.path.join(args.output_dir, "fasta_dir")
    os.makedirs(data_dir_path, exist_ok=True)
    os.makedirs(fasta_dir_path, exist_ok=True)

    url = generate_url(args.period, args.end_date)
    raw_data = requests.get(url).text
    parsed_data = json.loads(raw_data)
    chain_data = parsed_data["aaData"]
    for chain in chain_data:
        pdb_id = chain["pdbid"]
        chain_id = chain["pdbid_chain"]
        pdb_url = f"https://files.rcsb.org/view/{pdb_id.upper()}.cif"
        pdb_file = requests.get(pdb_url).text
        parsed_cif = mmcif_parsing.parse(
            file_id=pdb_id, mmcif_string=pdb_file
        )
        mmcif_object = parsed_cif.mmcif_object
        if(mmcif_object is None):
            raise list(parsed_cif.errors.values())[0]

        seq = mmcif_object.chain_to_seqres[chain_id]
        if(args.max_seqlen > 0 and len(seq) > args.max_seqlen):
            continue

        fasta_file = '\n'.join([
            f">{pdb_id}_{chain_id}",
            seq,
        ])
        fasta_filename = f"{pdb_id}_{chain_id}.fasta"
        with open(os.path.join(fasta_dir_path, fasta_filename), "w") as fp:
            fp.write(fasta_file)

        cif_filename = f"{pdb_id}.cif"
        with open(os.path.join(data_dir_path, cif_filename), "w") as fp:
            fp.write(pdb_file)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "period", type=str,
        help=f"""The length of the period from which to draw CAMEO proteins.
             Choose from {VALID_PERIODS}"""
    )
    parser.add_argument(
        "end_date", type=str,
        help="The date marking the end of the period (YYYY-MM-DD)"
    )
    parser.add_argument("output_dir")
    parser.add_argument(
        "--max_seqlen", type=int, default=700,
        help="The maximum length in residues of downloaded proteins (or -1)"
    )
    args = parser.parse_args()

    if(args.period not in VALID_PERIODS):
        raise ValueError(f"Invalid period. Choose from {VALID_PERIODS}")

    date_regex = re.compile("^[0-9]{4}-[0-9]{2}-[0-9]{2}$")
    if(not date_regex.match(args.end_date)):
        raise ValueError(f"Invalid end_date: {args.end_date}. Use YYYY-MM-DD format")

    main(args)
#!/bin/bash
#
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads the ColabFold environmental sequence database
# (colabfold_envdb_202108) for the MMseqs2-based alignment pipeline.
#
# Usage: bash download_colabfold_envdb.sh /path/to/download/directory
set -e

if [[ $# -eq 0 ]]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

if ! command -v aria2c &> /dev/null ; then
  echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
  exit 1
fi

DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}"
SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/colabfold/colabfold_envdb_202108.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")

mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4 --check-certificate=false
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the MGnify database for AlphaFold.
#
# Usage: bash download_mgnify.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

if ! command -v aria2c &> /dev/null ; then
  echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
  exit 1
fi

DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/mgnify"
# Mirror of:
# ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2018_12/mgy_clusters.fa.gz
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/casp14_versions/mgy_clusters_2018_12.fa.gz"
BASENAME=$(basename "${SOURCE_URL}")

mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
gunzip "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads the databases used by the MMseqs2-based alignment pipeline
# (UniRef30 and the ColabFold environmental database).
#
# Usage: bash <this script> /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

if ! command -v aria2c &> /dev/null ; then
  echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
  exit 1
fi

DOWNLOAD_DIR="$1"
DOWNLOAD_MODE="${2:-full_dbs}"  # Default mode to full_dbs.
if [[ "${DOWNLOAD_MODE}" != full_dbs && "${DOWNLOAD_MODE}" != reduced_dbs ]]
then
  echo "DOWNLOAD_MODE ${DOWNLOAD_MODE} not recognized."
  exit 1
fi
SCRIPT_DIR="$(dirname "$(realpath "$0")")"
echo "Downloading Uniref30..."
bash "${SCRIPT_DIR}/download_uniref30.sh" "${DOWNLOAD_DIR}"
echo "Downloading ColabFold's environmental database..."
bash "${SCRIPT_DIR}/download_colabfold_envdb.sh" "${DOWNLOAD_DIR}"
echo "All data downloaded."
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads OpenFold parameters.
#
# Usage: bash download_openfold_params.sh /path/to/download/directory
set -e

if [[ $# -eq 0 ]]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

if ! command -v aws &> /dev/null ; then
  echo "Error: the aws CLI could not be found. Please install the AWS CLI."
  exit 1
fi

DOWNLOAD_DIR="${1}/openfold_params"
mkdir -p "${DOWNLOAD_DIR}"
aws s3 cp --no-sign-request --region us-east-1 s3://openfold/openfold_params/ "${DOWNLOAD_DIR}" --recursive
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips OpenFold parameters from Google Drive. Alternative to
# the HuggingFace version.
#
# Usage: bash download_openfold_params_gdrive.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

FILE_ID="1GVzZA2nbdBbz6TKydvzquhfELJ3Movnb"
FILENAME="openfold_params_07_22.tar.gz"

download_from_gdrive() {
  FILE_ID="$1"
  OUT_DIR="$2"
  MSG=$(wget \
    --quiet \
    --save-cookies /tmp/cookies_$$.txt \
    --keep-session-cookies \
    --no-check-certificate \
    "https://docs.google.com/uc?export=download&id=${FILE_ID}" \
    -O- \
  )
  CONFIRM=$(echo $MSG | sed -rn "s/.*confirm=([0-9A-Za-z_]+).*/\1\n/p")
  FILENAME=$(echo $MSG | sed -e "s/.*<a href=\"\/open?id=${FILE_ID}\">\(.*\)<\/a> (.*/\1/")
  FILEPATH="${OUT_DIR}/${FILENAME}"
  wget \
    --quiet \
    --load-cookies /tmp/cookies_$$.txt \
    "https://docs.google.com/uc?export=download&confirm=${CONFIRM}&id=${FILE_ID}" \
    -O "${FILEPATH}"
  rm /tmp/cookies_$$.txt
  echo $FILEPATH
}

DOWNLOAD_DIR="$1"
mkdir -p "${DOWNLOAD_DIR}"
DOWNLOAD_PATH=$(download_from_gdrive $FILE_ID "${DOWNLOAD_DIR}")
DOWNLOAD_FILENAME=$(basename "${DOWNLOAD_PATH}")
if [[ $FILENAME != $DOWNLOAD_FILENAME ]]; then
  echo "Error: Downloaded filename ${DOWNLOAD_FILENAME} does not match expected filename ${FILENAME}"
  rm "${DOWNLOAD_PATH}"
  exit 1
fi

tar --extract --verbose --file="${DOWNLOAD_PATH}" \
  --directory="${DOWNLOAD_DIR}" --preserve-permissions
rm "${DOWNLOAD_PATH}"
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads OpenFold parameters from the HuggingFace Hub (via git clone).
#
# Usage: bash download_openfold_params_huggingface.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi
URL="https://huggingface.co/nz/OpenFold"
DOWNLOAD_DIR="${1}/openfold_params/"
mkdir -p "${DOWNLOAD_DIR}"
git clone $URL "${DOWNLOAD_DIR}"
rm -rf "${DOWNLOAD_DIR}/.git"