Commit 13f8f163 authored by zhuwenwen (parents: a509a4c5 b5fa2ba3)
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the PDB70 database for AlphaFold.
#
# Usage: bash download_pdb70.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/pdb70"
SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/old-releases/pdb70_from_mmcif_200401.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" --check-certificate=false
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
--directory="${ROOT_DIR}"
rm "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads, unzips and flattens the PDB database for AlphaFold.
#
# Usage: bash download_pdb_mmcif.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
if ! command -v rsync &> /dev/null ; then
echo "Error: rsync could not be found. Please install rsync."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/pdb_mmcif"
RAW_DIR="${ROOT_DIR}/raw"
MMCIF_DIR="${ROOT_DIR}/mmcif_files"
echo "Running rsync to fetch all mmCIF files (note that the rsync progress estimate might be inaccurate)..."
mkdir --parents "${RAW_DIR}"
rsync --recursive --links --perms --times --compress --info=progress2 --delete --port=33444 \
rsync.rcsb.org::ftp_data/structures/divided/mmCIF/ \
"${RAW_DIR}"
echo "Unzipping all mmCIF files..."
find "${RAW_DIR}/" -type f -iname "*.gz" -exec gunzip {} +
echo "Flattening all mmCIF files..."
mkdir --parents "${MMCIF_DIR}"
find "${RAW_DIR}" -type d -empty -delete # Delete empty directories.
for subdir in "${RAW_DIR}"/*; do
mv "${subdir}/"*.cif "${MMCIF_DIR}"
done
# Delete empty download directory structure.
find "${RAW_DIR}" -type d -empty -delete
aria2c "ftp://ftp.wwpdb.org/pub/pdb/data/status/obsolete.dat" --dir="${ROOT_DIR}"
#!/bin/bash
#
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads .cif files matching the RODA alignments. Outputs a list of
# RODA alignments for which .cif files could not be found.
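#
# Usage: bash download_roda_pdbs.sh <out_dir> <roda_pdb_alignment_dir>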
if [[ $# != 2 ]]; then
echo "usage: ./download_roda_pdbs.sh <out_dir> <roda_pdb_alignment_dir>"
exit 1
fi
OUT_DIR=$1
RODA_ALIGNMENT_DIR=$2
if [[ -d $OUT_DIR ]]; then
echo "${OUT_DIR} already exists. Download failed..."
exit 1
fi
SERVER=snapshotrsync.rcsb.org # RCSB server name
PORT=873 # port RCSB server is using
rsync -rlpt -v -z --delete --port=$PORT $SERVER::20220103/pub/pdb/data/structures/divided/mmCIF/ "${OUT_DIR}" > /dev/null 2>&1
for f in $(find "${OUT_DIR}" -mindepth 2 -type f); do
    mv "${f}" "${OUT_DIR}"
    BASENAME=$(basename "${f}")
    gunzip "${OUT_DIR}/${BASENAME}"
done
find "${OUT_DIR}" -mindepth 1 -type d,l -delete
for d in $(find $RODA_ALIGNMENT_DIR -mindepth 1 -maxdepth 1 -type d); do
BASENAME=$(basename $d)
PDB_ID=$(echo $BASENAME | cut -d '_' -f 1)
CIF_PATH="${OUT_DIR}/${PDB_ID}.cif"
if [[ ! -f $CIF_PATH ]]; then
echo $d
fi
done
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the Small BFD database for AlphaFold.
#
# Usage: bash download_small_bfd.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/small_bfd"
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/reduced_dbs/bfd-first_non_consensus_sequences.fasta.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
pushd "${ROOT_DIR}"
gunzip "${ROOT_DIR}/${BASENAME}"
popd
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the Uniclust30 database for AlphaFold.
#
# Usage: bash download_uniclust30.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/uniclust30"
# Mirror of:
# http://wwwuser.gwdg.de/~compbiol/uniclust/2018_08/uniclust30_2018_08_hhsuite.tar.gz
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/casp14_versions/uniclust30_2018_08_hhsuite.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
--directory="${ROOT_DIR}"
rm "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads the ColabFold UniRef30 (2021_03) database for AlphaFold.
# Note that, despite the usage line below, the source URL points at
# UniRef30 rather than the original BFD.
#
# Usage: bash download_bfd.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}"
SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4 --check-certificate=false
gunzip "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the UniRef90 database for AlphaFold.
#
# Usage: bash download_uniref90.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/uniref90"
SOURCE_URL="ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
gunzip "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Flattens a downloaded RODA database into the format expected by OpenFold
# Args:
# roda_dir:
# The path to the database you want to flatten. E.g. "roda/pdb"
# or "roda/uniclust30". Note that, to save space, this script
# will empty this directory.
# output_dir:
# The directory in which to construct the reformatted data
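#
# For example (hypothetical file names), an input tree like
#     roda/pdb/1abc_A/cif/1abc.cif
#     roda/pdb/1abc_A/a3m/uniref90_hits.a3m
# is rearranged into
#     <output_dir>/data/1abc.cif
#     <output_dir>/alignments/1abc_A/uniref90_hits.a3m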
if [[ $# != 2 ]]; then
echo "usage: ./flatten_roda.sh <roda_dir> <output_dir>"
exit 1
fi
RODA_DIR=$1
OUTPUT_DIR=$2
DATA_DIR="${OUTPUT_DIR}/data"
ALIGNMENT_DIR="${OUTPUT_DIR}/alignments"
mkdir -p "${DATA_DIR}"
mkdir -p "${ALIGNMENT_DIR}"
for CHAIN_DIR_PATH in "${RODA_DIR}"/*; do
    chain_dir=$(basename "${CHAIN_DIR_PATH}")
    for subdir_path in "${CHAIN_DIR_PATH}"/*; do
        subdir=$(basename "${subdir_path}")
if [[ $subdir = "pdb" ]] || [[ $subdir = "cif" ]]; then
mv "${CHAIN_DIR_PATH}/${subdir}"/* "${DATA_DIR}"
else
CHAIN_ALIGNMENT_DIR="${ALIGNMENT_DIR}/${chain_dir}"
mkdir -p "${CHAIN_ALIGNMENT_DIR}"
mv "${CHAIN_DIR_PATH}/${subdir}"/* "${CHAIN_ALIGNMENT_DIR}"
fi
done
done
NO_DATA_FILES=$(find "${DATA_DIR}" -type f | wc -l)
if [[ $NO_DATA_FILES -eq 0 ]]; then
    rm -rf "${DATA_DIR}"
fi
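"""Runs the AlphaFold data pipeline on an input FASTA file and pickles
the resulting feature dict to <output_dir>/feature_dict.pickle."""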
import argparse
import os
import pickle
from alphafold.data import pipeline, templates
from scripts.utils import add_data_args
def main(args):
template_featurizer = templates.TemplateHitFeaturizer(
mmcif_dir=args.mmcif_dir,
max_template_date=args.max_template_date,
max_hits=20,
kalign_binary_path=args.kalign_binary_path,
release_dates_path=None,
obsolete_pdbs_path=args.obsolete_pdbs_path,
)
data_pipeline = pipeline.DataPipeline(
jackhmmer_binary_path=args.jackhmmer_binary_path,
hhblits_binary_path=args.hhblits_binary_path,
hhsearch_binary_path=args.hhsearch_binary_path,
uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path,
uniclust30_database_path=args.uniclust30_database_path,
pdb70_database_path=args.pdb70_database_path,
small_bfd_database_path=None,
template_featurizer=template_featurizer,
use_small_bfd=False,
)
feature_dict = data_pipeline.process(
input_fasta_path=args.fasta_path,
msa_output_dir=args.output_dir,
)
with open(os.path.join(args.output_dir, "feature_dict.pickle"), "wb") as fp:
pickle.dump(feature_dict, fp, protocol=pickle.HIGHEST_PROTOCOL)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("fasta_path", type=str)
parser.add_argument("mmcif_dir", type=str)
parser.add_argument("output_dir", type=str)
add_data_args(parser)
args = parser.parse_args()
main(args)
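"""Builds a JSON cache of chain-level metadata (sequence, release date,
resolution and, optionally, cluster size) from a directory of mmCIF
and/or PDB files, parsing files in parallel."""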
import argparse
from functools import partial
import json
import logging
from multiprocessing import Pool
import os
import sys
sys.path.append(".") # an innocent hack to get this to run from the top level
from tqdm import tqdm
from openfold.data.mmcif_parsing import parse
from openfold.np import protein, residue_constants
def parse_file(
f,
args,
chain_cluster_size_dict
):
file_id, ext = os.path.splitext(f)
if(ext == ".cif"):
with open(os.path.join(args.data_dir, f), "r") as fp:
mmcif_string = fp.read()
mmcif = parse(file_id=file_id, mmcif_string=mmcif_string)
if mmcif.mmcif_object is None:
logging.info(f"Could not parse {f}. Skipping...")
return {}
else:
mmcif = mmcif.mmcif_object
out = {}
for chain_id, seq in mmcif.chain_to_seqres.items():
full_name = "_".join([file_id, chain_id])
out[full_name] = {}
local_data = out[full_name]
local_data["release_date"] = mmcif.header["release_date"]
local_data["seq"] = seq
local_data["resolution"] = mmcif.header["resolution"]
if(chain_cluster_size_dict is not None):
cluster_size = chain_cluster_size_dict.get(
full_name.upper(), -1
)
local_data["cluster_size"] = cluster_size
elif(ext == ".pdb"):
with open(os.path.join(args.data_dir, f), "r") as fp:
pdb_string = fp.read()
protein_object = protein.from_pdb_string(pdb_string, None)
chain_dict = {}
chain_dict["seq"] = residue_constants.aatype_to_str_sequence(
protein_object.aatype,
)
chain_dict["resolution"] = 0.
if(chain_cluster_size_dict is not None):
cluster_size = chain_cluster_size_dict.get(
                file_id.upper(), -1
)
chain_dict["cluster_size"] = cluster_size
out = {file_id: chain_dict}
return out
def main(args):
chain_cluster_size_dict = None
if(args.cluster_file is not None):
chain_cluster_size_dict = {}
with open(args.cluster_file, "r") as fp:
clusters = [l.strip() for l in fp.readlines()]
for cluster in clusters:
chain_ids = cluster.split()
cluster_len = len(chain_ids)
for chain_id in chain_ids:
chain_id = chain_id.upper()
chain_cluster_size_dict[chain_id] = cluster_len
accepted_exts = [".cif", ".pdb"]
files = list(os.listdir(args.data_dir))
files = [f for f in files if os.path.splitext(f)[-1] in accepted_exts]
fn = partial(
parse_file,
args=args,
chain_cluster_size_dict=chain_cluster_size_dict,
)
data = {}
with Pool(processes=args.no_workers) as p:
with tqdm(total=len(files)) as pbar:
for d in p.imap_unordered(fn, files, chunksize=args.chunksize):
data.update(d)
pbar.update()
with open(args.output_path, "w") as fp:
fp.write(json.dumps(data, indent=4))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"data_dir", type=str, help="Directory containing mmCIF or PDB files"
)
parser.add_argument(
"output_path", type=str, help="Path for .json output"
)
parser.add_argument(
"--cluster_file", type=str, default=None,
help=(
"Path to a cluster file (e.g. PDB40), one cluster "
"({PROT1_ID}_{CHAIN_ID} {PROT2_ID}_{CHAIN_ID} ...) per line. "
"Chains not in this cluster file will NOT be filtered by cluster "
"size."
)
)
parser.add_argument(
"--no_workers", type=int, default=4,
help="Number of workers to use for parsing"
)
parser.add_argument(
"--chunksize", type=int, default=10,
help="How many files should be distributed to each worker at a time"
)
args = parser.parse_args()
main(args)
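"""Builds a JSON cache of mmCIF metadata (chain IDs, sequences, chain
count, release date and resolution), keyed by file ID and parsed in
parallel."""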
import argparse
from functools import partial
import json
import logging
from multiprocessing import Pool
import os
import sys
sys.path.append(".") # an innocent hack to get this to run from the top level
from tqdm import tqdm
from openfold.data.mmcif_parsing import parse
def parse_file(f, args):
with open(os.path.join(args.mmcif_dir, f), "r") as fp:
mmcif_string = fp.read()
file_id = os.path.splitext(f)[0]
mmcif = parse(file_id=file_id, mmcif_string=mmcif_string)
if mmcif.mmcif_object is None:
logging.info(f"Could not parse {f}. Skipping...")
return {}
else:
mmcif = mmcif.mmcif_object
local_data = {}
local_data["release_date"] = mmcif.header["release_date"]
chain_ids, seqs = list(zip(*mmcif.chain_to_seqres.items()))
local_data["chain_ids"] = chain_ids
local_data["seqs"] = seqs
local_data["no_chains"] = len(chain_ids)
local_data["resolution"] = mmcif.header["resolution"]
return {file_id: local_data}
def main(args):
    files = [f for f in os.listdir(args.mmcif_dir) if f.endswith(".cif")]
fn = partial(parse_file, args=args)
data = {}
with Pool(processes=args.no_workers) as p:
with tqdm(total=len(files)) as pbar:
for d in p.imap_unordered(fn, files, chunksize=args.chunksize):
data.update(d)
pbar.update()
with open(args.output_path, "w") as fp:
fp.write(json.dumps(data, indent=4))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"mmcif_dir", type=str, help="Directory containing mmCIF files"
)
parser.add_argument(
"output_path", type=str, help="Path for .json output"
)
parser.add_argument(
"--no_workers", type=int, default=4,
help="Number of workers to use for parsing"
)
parser.add_argument(
"--chunksize", type=int, default=10,
help="How many files should be distributed to each worker at a time"
)
args = parser.parse_args()
main(args)
#!/bin/bash
git clone --branch v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite \
&& mkdir /tmp/hh-suite/build \
&& pushd /tmp/hh-suite/build \
&& cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. \
&& make -j 4 && make install \
&& ln -sf /opt/hhsuite/bin/* /usr/bin \
&& popd \
&& rm -rf /tmp/hh-suite
#!/bin/bash
CONDA_INSTALL_URL=${CONDA_INSTALL_URL:-"https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh"}
source scripts/vars.sh
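# scripts/vars.sh is expected to define ENV_NAME, used below to name the
# conda environment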
# Install Miniconda locally
rm -rf lib/conda
CONDA_INSTALLER="/tmp/$(basename "${CONDA_INSTALL_URL}")"
rm -f "${CONDA_INSTALLER}"
wget -P /tmp \
    "${CONDA_INSTALL_URL}" \
    && bash "${CONDA_INSTALLER}" -b -p lib/conda \
    && rm "${CONDA_INSTALLER}"
# Grab conda-only packages
export PATH=lib/conda/bin:$PATH
lib/conda/bin/python3 -m pip install nvidia-pyindex
conda env create --name=${ENV_NAME} -f environment.yml
source scripts/activate_conda_env.sh
echo "Attempting to install FlashAttention"
git clone https://github.com/HazyResearch/flash-attention
CUR_DIR=$PWD
cd flash-attention
git checkout 5b838a8bef
python3 setup.py install
cd $CUR_DIR
# Install DeepMind's OpenMM patch
OPENFOLD_DIR=$PWD
pushd lib/conda/envs/$ENV_NAME/lib/python3.7/site-packages/ \
&& patch -p0 < $OPENFOLD_DIR/lib/openmm.patch \
&& popd
# Download folding resources
wget --no-check-certificate -P openfold/resources \
https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt
# Certain tests need access to this file
mkdir -p tests/test_data/alphafold/common
ln -rs openfold/resources/stereo_chemical_props.txt tests/test_data/alphafold/common
echo "Downloading OpenFold parameters..."
bash scripts/download_openfold_params.sh openfold/resources
echo "Downloading AlphaFold parameters..."
bash scripts/download_alphafold_params.sh openfold/resources
# Decompress test data
gunzip tests/test_data/sample_feats.pickle.gz
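"""Precomputes alignments for mmCIF, FASTA and ProteinNet .core files.
Unique sequences are aligned once and the results are copied to every
chain that shares the sequence. Work can be sharded across SLURM nodes
and split among local threads via --no_tasks."""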
import argparse
from functools import partial
import json
import logging
import os
import threading
from multiprocessing import cpu_count
from shutil import copyfile
import tempfile
import openfold.data.mmcif_parsing as mmcif_parsing
from openfold.data.data_pipeline import AlignmentRunner
from openfold.data.parsers import parse_fasta
from openfold.np import protein, residue_constants
from utils import add_data_args
logging.basicConfig(level=logging.WARNING)
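# Runs the alignment tools once per unique sequence, then copies the
# resulting alignments to the directories of all chains sharing that
# sequence.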
def run_seq_group_alignments(seq_groups, alignment_runner, args):
dirs = set(os.listdir(args.output_dir))
for seq, names in seq_groups:
first_name = names[0]
alignment_dir = os.path.join(args.output_dir, first_name)
try:
os.makedirs(alignment_dir)
except Exception as e:
logging.warning(f"Failed to create directory for {first_name} with exception {e}...")
continue
fd, fasta_path = tempfile.mkstemp(suffix=".fasta")
with os.fdopen(fd, 'w') as fp:
fp.write(f'>query\n{seq}')
try:
alignment_runner.run(
fasta_path, alignment_dir
)
        except Exception:
            logging.warning(f"Failed to run alignments for {first_name}. Skipping...")
os.remove(fasta_path)
os.rmdir(alignment_dir)
continue
os.remove(fasta_path)
for name in names[1:]:
if(name in dirs):
logging.warning(
f'{name} has already been processed. Skipping...'
)
continue
cp_dir = os.path.join(args.output_dir, name)
os.makedirs(cp_dir, exist_ok=True)
for f in os.listdir(alignment_dir):
copyfile(os.path.join(alignment_dir, f), os.path.join(cp_dir, f))
def parse_and_align(files, alignment_runner, args):
for f in files:
path = os.path.join(args.input_dir, f)
file_id = os.path.splitext(f)[0]
seq_group_dict = {}
if(f.endswith('.cif')):
with open(path, 'r') as fp:
mmcif_str = fp.read()
mmcif = mmcif_parsing.parse(
file_id=file_id, mmcif_string=mmcif_str
)
if(mmcif.mmcif_object is None):
logging.warning(f'Failed to parse {f}...')
if(args.raise_errors):
raise list(mmcif.errors.values())[0]
else:
continue
mmcif = mmcif.mmcif_object
for chain_letter, seq in mmcif.chain_to_seqres.items():
chain_id = '_'.join([file_id, chain_letter])
l = seq_group_dict.setdefault(seq, [])
l.append(chain_id)
elif(f.endswith('.fasta') or f.endswith('.fa')):
with open(path, 'r') as fp:
fasta_str = fp.read()
input_seqs, _ = parse_fasta(fasta_str)
if len(input_seqs) != 1:
msg = f'More than one input_sequence found in {f}'
if(args.raise_errors):
raise ValueError(msg)
else:
logging.warning(msg)
input_sequence = input_seqs[0]
seq_group_dict[input_sequence] = [file_id]
elif(f.endswith('.core')):
with open(path, 'r') as fp:
core_str = fp.read()
core_prot = protein.from_proteinnet_string(core_str)
aatype = core_prot.aatype
seq = ''.join([
residue_constants.restypes_with_x[aatype[i]]
for i in range(len(aatype))
])
seq_group_dict[seq] = [file_id]
else:
continue
seq_group_tuples = [(k,v) for k,v in seq_group_dict.items()]
run_seq_group_alignments(seq_group_tuples, alignment_runner, args)
def main(args):
# Build the alignment tool runner
alignment_runner = AlignmentRunner(
jackhmmer_binary_path=args.jackhmmer_binary_path,
hhblits_binary_path=args.hhblits_binary_path,
hhsearch_binary_path=args.hhsearch_binary_path,
uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path,
uniclust30_database_path=args.uniclust30_database_path,
pdb70_database_path=args.pdb70_database_path,
use_small_bfd=args.bfd_database_path is None,
no_cpus=args.cpus_per_task,
)
files = list(os.listdir(args.input_dir))
# Do some filtering
if(args.mmcif_cache is not None):
with open(args.mmcif_cache, "r") as fp:
cache = json.load(fp)
else:
cache = None
dirs = []
if(cache is not None and args.filter):
dirs = set(os.listdir(args.output_dir))
def prot_is_done(f):
prot_id = os.path.splitext(f)[0]
if(prot_id in cache):
chain_ids = cache[prot_id]["chain_ids"]
for c in chain_ids:
full_name = prot_id + "_" + c
if(not full_name in dirs):
return False
else:
return False
return True
files = [f for f in files if not prot_is_done(f)]
def split_up_arglist(arglist):
# Split up the survivors
if(os.environ.get("SLURM_JOB_NUM_NODES", 0)):
num_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
if(num_nodes > 1):
node_id = int(os.environ["SLURM_NODEID"])
logging.warning(f"Num nodes: {num_nodes}")
logging.warning(f"Node ID: {node_id}")
arglist = arglist[node_id::num_nodes]
t_arglist = []
for i in range(args.no_tasks):
t_arglist.append(arglist[i::args.no_tasks])
return t_arglist
if(cache is not None and "seqs" in next(iter(cache.values()))):
seq_group_dict = {}
for f in files:
prot_id = os.path.splitext(f)[0]
if(prot_id in cache):
prot_cache = cache[prot_id]
chains_seqs = zip(
prot_cache["chain_ids"], prot_cache["seqs"]
)
for chain, seq in chains_seqs:
chain_name = prot_id + "_" + chain
if(chain_name not in dirs):
l = seq_group_dict.setdefault(seq, [])
l.append(chain_name)
func = partial(run_seq_group_alignments,
alignment_runner=alignment_runner,
args=args
)
seq_groups = [(k,v) for k,v in seq_group_dict.items()]
# Sort them by group length so the tasks are approximately balanced
seq_groups = sorted(seq_groups, key=lambda x: len(x[1]))
task_arglist = [[a] for a in split_up_arglist(seq_groups)]
else:
func = partial(parse_and_align,
alignment_runner=alignment_runner,
args=args,
)
task_arglist = [[a] for a in split_up_arglist(files)]
threads = []
for i, task_args in enumerate(task_arglist):
print(f"Started thread {i}...")
t = threading.Thread(target=func, args=task_args)
threads.append(t)
t.start()
for t in threads:
t.join()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"input_dir", type=str,
help="""Path to directory containing mmCIF, FASTA and/or ProteinNet
.core files"""
)
parser.add_argument(
"output_dir", type=str,
help="Directory in which to output alignments"
)
add_data_args(parser)
parser.add_argument(
"--raise_errors", action="store_true", default=False,
help="Whether to crash on parsing errors"
)
parser.add_argument(
"--cpus_per_task", type=int, default=cpu_count(),
help="Number of CPUs to use"
)
parser.add_argument(
"--mmcif_cache", type=str, default=None,
help="Path to mmCIF cache. Used to filter files to be parsed"
)
    parser.add_argument(
        "--no_tasks", type=int, default=1,
        help="Number of worker threads to spawn per node"
    )
    parser.add_argument(
        "--filter", type=lambda s: s.lower() not in ("false", "0", "no"),
        default=True,
        help="""Whether to skip chains that already have alignment
            directories (requires --mmcif_cache)"""
    )
args = parser.parse_args()
main(args)
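"""Generates MSAs with MMseqs2 via scripts/colabfold_search.sh,
processing the input FASTA in chunks, splitting the resulting
multi-entry .a3m databases into per-sequence directories and, if an
hhsearch binary and PDB70 are provided, computing template hits.
Assumes a two-line-per-record input FASTA."""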
import argparse
import logging
import os
from pathlib import Path
import subprocess
from openfold.data.tools import hhsearch
def _split_a3ms(output_dir):
for fname in os.listdir(output_dir):
if(not os.path.splitext(fname)[-1] == ".a3m"):
continue
fpath = os.path.join(output_dir, fname)
with open(fpath, "r") as fp:
a3ms = fp.read()
# Split by the null byte, excluding the terminating null byte
a3ms = a3ms.split('\x00')[:-1]
for a3m in a3ms:
name = a3m.split('\n', 1)[0][1:]
prot_dir = os.path.join(output_dir, name)
Path(prot_dir).mkdir(parents=True, exist_ok=True)
with open(os.path.join(prot_dir, fname), "w") as fp:
fp.write(a3m)
os.remove(fpath)
os.remove(fpath + ".dbtype")
os.remove(fpath + ".index")
def main(args):
with open(args.input_fasta, "r") as f:
lines = [l.strip() for l in f.readlines()]
names = lines[::2]
seqs = lines[1::2]
if(args.fasta_chunk_size is None):
chunk_size = len(seqs)
else:
chunk_size = args.fasta_chunk_size
# Make the output directory
Path(args.output_dir).mkdir(parents=True, exist_ok=True)
s = 0
while(s < len(seqs)):
e = s + chunk_size
chunk_fasta = [el for tup in zip(names[s:e], seqs[s:e]) for el in tup]
s = e
prot_dir = os.path.join(args.output_dir, chunk_fasta[0][1:].upper())
if(os.path.exists(prot_dir)):
# We've already computed this chunk
continue
chunk_fasta_path = os.path.join(args.output_dir, "tmp.fasta")
with open(chunk_fasta_path, "w") as f:
f.write('\n'.join(chunk_fasta) + '\n')
cmd = [
"scripts/colabfold_search.sh",
args.mmseqs_binary_path,
chunk_fasta_path,
args.mmseqs_db_dir,
args.output_dir,
args.uniref_db,
'""',
'""' if args.env_db is None else args.env_db,
"0" if args.env_db is None else "1",
"0", # compute templates
"1", # filter
"1", # use precomputed index
"0", # db-load-mode
]
logging.info('Launching subprocess "%s"', " ".join(cmd))
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
stdout, stderr = process.communicate()
retcode = process.wait()
if retcode:
raise RuntimeError(
"MMseqs failed\nstdout:\n%s\n\nstderr:\n%s\n"
% (stdout.decode("utf-8"), stderr.decode("utf-8"))
)
_split_a3ms(args.output_dir)
# Clean up temporary files
os.remove(chunk_fasta_path)
hhsearch_pdb70_runner = hhsearch.HHSearch(
binary_path=args.hhsearch_binary_path, databases=[args.pdb70]
)
for d in os.listdir(args.output_dir):
dpath = os.path.join(args.output_dir, d)
if(not os.path.isdir(dpath)):
continue
for fname in os.listdir(dpath):
fpath = os.path.join(dpath, fname)
if(not "uniref" in fname or
not os.path.splitext(fname)[-1] == ".a3m"):
continue
with open(fpath, "r") as fp:
a3m = fp.read()
hhsearch_result = hhsearch_pdb70_runner.query(a3m)
pdb70_out_path = os.path.join(dpath, "pdb70_hits.hhr")
with open(pdb70_out_path, "w") as f:
f.write(hhsearch_result)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"input_fasta", type=str,
help="Path to input FASTA file. Can contain one or more sequences."
)
parser.add_argument(
"mmseqs_db_dir", type=str,
help="""Path to directory containing pre-processed MMSeqs2 DBs
(see README)"""
)
parser.add_argument(
"uniref_db", type=str,
help="Basename of uniref database"
)
parser.add_argument(
"output_dir", type=str,
help="Output directory"
)
parser.add_argument(
"mmseqs_binary_path", type=str,
help="Path to mmseqs binary"
)
parser.add_argument(
"--hhsearch_binary_path", type=str, default=None,
help="""Path to hhsearch binary (for template search). In future
versions, we'll also use mmseqs for this"""
)
parser.add_argument(
"--pdb70", type=str, default=None,
help="Basename of the pdb70 database"
)
parser.add_argument(
"--env_db", type=str, default=None,
help="Basename of environmental database"
)
parser.add_argument(
"--fasta_chunk_size", type=int, default=None,
help="""How many sequences should be processed at once. All sequences
processed at once by default."""
)
args = parser.parse_args()
if(args.hhsearch_binary_path is not None and args.pdb70 is None):
raise ValueError(
"pdb70 must be specified along with hhsearch_binary_path"
)
main(args)
#!/bin/bash
#
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Prepares MMseqs2 expandable profile databases (and their indices) from
# database tarballs already present in the download directory.
#
# Usage: bash prep_mmseqs_dbs.sh /path/to/download/directory
set -e
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/mmseqs_dbs"
mkdir -p "${ROOT_DIR}"
for f in "${DOWNLOAD_DIR}"/*.tar*
do
    tar --extract --verbose --file="${f}" \
        --directory="${ROOT_DIR}"
    rm "${f}"
    BASENAME="$(basename "${f%%.*}")"
    DB_NAME="${BASENAME}_db"
    OLD_PWD=$(pwd)
    cd "${ROOT_DIR}"
mmseqs tsv2exprofiledb "${BASENAME}" "${DB_NAME}"
mmseqs createindex "${DB_NAME}" "${DOWNLOAD_DIR}/tmp/"
cd "${OLD_PWD}"
done
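"""Sorts raw ProteinNet MSA files into per-chain directories named
{PDB_ID}_{CHAIN_ID}, keeping only MSAs with a matching file in the
mmCIF directory."""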
import argparse
import logging
import os
import shutil
def main(args):
count = 0
max_count = args.max_count if args.max_count is not None else -1
msas = sorted(f for f in os.listdir(args.msa_dir))
mmcifs = sorted(f for f in os.listdir(args.mmcif_dir))
mmcif_idx = 0
for f in msas:
if(count == max_count):
break
path = os.path.join(args.msa_dir, f)
name = os.path.splitext(f)[0]
spl = name.upper().split('_')
if(len(spl) != 3):
continue
pdb_id, _, chain_id = spl
        while (mmcif_idx < len(mmcifs) - 1
               and pdb_id > os.path.splitext(mmcifs[mmcif_idx])[0].upper()):
            mmcif_idx += 1
# Only consider files with matching mmCIF files
if(pdb_id == os.path.splitext(mmcifs[mmcif_idx])[0].upper()):
dirname = os.path.join(args.out_dir, '_'.join([pdb_id, chain_id]))
os.makedirs(dirname, exist_ok=True)
dest = os.path.join(dirname, f)
if(args.copy):
shutil.copyfile(path, dest)
else:
os.rename(path, dest)
count += 1
if __name__ == "__main__":
parser = argparse.ArgumentParser(description=
"Converts raw ProteinNet MSAs into a format recognized by the parser"
)
parser.add_argument(
"msa_dir", type=str, help="Directory containing ProteinNet MSAs"
)
parser.add_argument(
"mmcif_dir", type=str, help="Directory containing PDB mmCIFs"
)
parser.add_argument(
"out_dir", type=str,
help="Directory to which output should be saved"
)
    parser.add_argument(
        "--copy", type=lambda s: s.lower() not in ("false", "0", "no"),
        default=True,
        help="Whether to copy the MSAs to out_dir rather than moving them"
    )
parser.add_argument(
"--max_count", type=int, default=None,
help="A bound on the number of MSAs to process"
)
args = parser.parse_args()
main(args)
#!/bin/bash
export CUDA_VISIBLE_DEVICES="0"
python3 -m unittest "$@" || \
echo -e "\nTest(s) failed. Make sure you've installed all Python dependencies."
#!/bin/bash
# Generates uniclust30 all-against-all alignments on a SLURM cluster.
# Thanks to Milot Mirdita for help & feedback on this script.
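# Requires hhblits, zip/zipinfo and python3 (for filter_ffindex.py) on
# PATH, and SLURM_NODEID / SLURM_JOB_NUM_NODES in the environment.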
set -e
if [[ $# != 3 ]]; then
echo "usage: ./run_uniclust30_search.sh <uniclust30_path> <scratch_dir> <out_dir>"
    exit 1
fi
UNICLUST_PATH=$1
SCRATCH_DIR_BN=$2
OUT_DIR=$3
CPUS_PER_TASK=4
MAX_SIZE=10000000000 # 10GB
SCRATCH_DIR="${SCRATCH_DIR_BN}_${SLURM_NODEID}"
mkdir -p ${SCRATCH_DIR}
mkdir -p ${OUT_DIR}
# copy database to local ssd
DB_BN=$(basename $UNICLUST_PATH)
DB_DIR="/dev/shm/uniclust30"
mkdir -p $DB_DIR
cp ${UNICLUST_PATH}*.ff* $DB_DIR
DB="${DB_DIR}/${DB_BN}"
for f in $(ls $OUT_DIR/*.zip)
do
zipinfo -1 $f '*/' | awk -F/ '{print $(NF-1)}' >> ${DB_DIR}/already_searched.txt
done
python3 filter_ffindex.py ${DB}_a3m.ffindex ${DB_DIR}/already_searched.txt ${DB_DIR}/filtered_a3m.ffindex
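# Shard the filtered index across nodes: this node takes the
# (SLURM_NODEID + 1)-th of SLURM_JOB_NUM_NODES line-based chunks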
TARGET="${DB}_a3m_${SLURM_NODEID}.ffindex"
split -n "l/$((SLURM_NODEID + 1))/${SLURM_JOB_NUM_NODES}" "${DB_DIR}/filtered_a3m.ffindex" > $TARGET
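# open_sem N: create a counting semaphore from a FIFO preloaded with N
# three-digit tokens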
open_sem() {
mkfifo pipe-$$
exec 3<>pipe-$$
rm pipe-$$
local i=$1
for ((;i>0;i--)); do
printf %s 000 >&3
done
}
# run the given command asynchronously and pop/push tokens
run_with_lock() {
local x
# this read waits until there is something to read
read -u 3 -n 3 x && ((0==x)) || exit $x
(
( "$@"; )
# push the return code of the command to the semaphore
printf '%.3d' $? >&3
)&
}
task() {
    local KEY=$1 OFF=$2 LEN=$3
    # Stream a single a3m record out of the ffdata file by byte offset
    # and length, and align it against the database with hhblits
    dd if="${DB}_a3m.ffdata" ibs=1 skip="${OFF}" count="${LEN}" status=none | \
        hhblits -i stdin \
            -oa3m "${SCRATCH_DIR}/${KEY}/uniclust30.a3m" \
            -v 0 \
            -o /dev/null \
            -cpu $CPUS_PER_TASK \
            -d $DB \
            -n 3 \
            -e 0.001
}
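# Archive finished alignments into a randomly named zip and clear the
# scratch space. The size check against MAX_SIZE is currently disabled,
# so this runs on every call.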
zip_or_not() {
SIZE=$(du -hbs $SCRATCH_DIR | sed 's/|/ /' | awk '{print $1}')
#if [[ "$SIZE" -gt "$MAX_SIZE" ]]
if [[ "2" -gt "1" ]]
then
wait
RANDOM_NAME=$(cat /dev/urandom | tr -cd 'a-f0-9' | head -c 32)
zip -r "${OUT_DIR}/${RANDOM_NAME}.zip" $SCRATCH_DIR
find $SCRATCH_DIR -mindepth 1 -type d -exec rm -rf {} +
fi
}
N=$(($(nproc) / ${CPUS_PER_TASK}))
open_sem $N
while read -r KEY OFF LEN; do
PROT_DIR="${SCRATCH_DIR}/${KEY}"
if [[ -d $PROT_DIR ]]
then
continue
fi
mkdir -p $PROT_DIR
run_with_lock task "${KEY}" "${OFF}" "${LEN}"
zip_or_not
done < $TARGET
wait
zip_or_not
wait
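"""Splits a monolithic ProteinNet text file into one .core file per
entry, treating "[ID]" lines as record delimiters."""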
import argparse
import os
from pathlib import Path
def _write_file(args, file_in_progress):
file_id = file_in_progress[1]
fname = file_id.upper() + ".core"
fpath = os.path.join(args.output_dir, fname)
with open(fpath, "w") as fp:
fp.write('\n'.join(file_in_progress))
def main(args):
Path(args.output_dir).mkdir(parents=True, exist_ok=True)
with open(args.proteinnet_file, "r") as fp:
proteinnet_string = fp.readlines()
file_in_progress = []
for line in proteinnet_string:
if(line == "[ID]\n"):
if(len(file_in_progress) > 0):
_write_file(args, file_in_progress)
file_in_progress = []
file_in_progress.append(line.strip())
if(len(file_in_progress) > 0):
_write_file(args, file_in_progress)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"proteinnet_file", type=str,
help="Path to ProteinNet file to unpack"
)
parser.add_argument(
"output_dir", type=str,
help="Path to directory in which to output .core files"
)
args = parser.parse_args()
main(args)