Commit 39a6d0e6 authored by Christina Floristean's avatar Christina Floristean
Browse files

Merging in main branch

parents d8ee9c5f 84659c93
#!/bin/bash -e
# Copied from colabfold.mmseqs.com
#!/bin/bash -e
MMSEQS="$1"
QUERY="$2"
DBBASE="$3"
......
# Copyright 2022 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Converts OpenFold .pt checkpoints into AlphaFold .npz ones, which can then be
# used to run inference using DeepMind's JAX code.
import argparse
import numpy as np
import torch
from openfold.config import model_config
from openfold.model.model import AlphaFold
from openfold.utils.import_weights import (
Param,
ParamType,
generate_translation_dict,
process_translation_dict,
)
from openfold.utils.tensor_utils import tree_map
def reshape_fn(of_param, af_weight):
    """Convert an OpenFold parameter to the layout of its AlphaFold
    counterpart `af_weight` (transposing Linear weights and flattening
    MHA/OPM weights as required).
    """
    def _t(w):
        # Linear weights are stored transposed relative to AlphaFold.
        return w.transpose(-1, -2)

    def _r(w):
        # Collapse to the exact shape of the target AlphaFold weight.
        return w.reshape(af_weight.shape)

    transformations = {
        ParamType.LinearWeight: _t,
        ParamType.LinearWeightMHA: lambda w: _r(_t(w)),
        ParamType.LinearMHAOutputWeight: lambda w: _r(_t(w)),
        ParamType.LinearBiasMHA: _r,
        ParamType.LinearWeightOPM: lambda w: _r(_t(w)),
        ParamType.Other: lambda w: w,
    }

    # Stacked params hold one tensor per block; materialize them as one
    # stacked tensor so a single transformation covers all blocks.
    if of_param.stacked:
        of_weight = torch.stack([torch.Tensor(p) for p in of_param.param])
    else:
        of_weight = torch.Tensor(of_param.param)

    return transformations[of_param.param_type](of_weight)
def transfer(of_dict, af_weight_template):
    """Recursively copy OpenFold params into the AlphaFold weight template.

    `af_weight_template` is modified in place: each leaf numpy array is
    overwritten with the reshaped OpenFold weight of the same key.
    """
    for key, value in of_dict.items():
        target = af_weight_template[key]
        if type(value) == dict:
            # Nested module scope: descend into the matching sub-template.
            transfer(value, target)
        else:
            # Leaf parameter: convert to the AF layout, then overwrite the
            # template buffer in place.
            reshaped = reshape_fn(value, target).detach().numpy()
            np.copyto(target, reshaped)
def main(args):
    """Convert an OpenFold .pt checkpoint into an AlphaFold-style .npz file."""
    # Load the OpenFold checkpoint (a state dict saved with torch.save).
    d = torch.load(args.of_pt_path)
    # Instantiate the model for this preset so the translation dict can be
    # generated from its parameter structure.
    config = model_config(args.config_preset)
    model = AlphaFold(config)
    model.load_state_dict(d)
    # Build the OpenFold-name -> AlphaFold-name parameter translation.
    translation = generate_translation_dict(model, args.config_preset)
    translation = process_translation_dict(translation)
    # Use an existing AlphaFold .npz as a template, keeping only the entries
    # we can actually fill from the OpenFold checkpoint.
    af_weight_template = np.load(args.template_npz_path)
    af_weight_template = {k:v for k,v in af_weight_template.items() if k in translation}
    # Zero the template first so any parameter left unfilled is conspicuous.
    zero = lambda n: n * 0
    af_weight_template = tree_map(zero, af_weight_template, np.ndarray)
    # Copy the translated OpenFold weights into the template and save it.
    transfer(translation, af_weight_template)
    np.savez(args.out_path, **af_weight_template)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required positional arguments, in CLI order.
    for arg_name, arg_help in [
        ("of_pt_path", "Path to OpenFold .pt checkpoint file"),
        ("config_preset", "The corresponding config preset"),
        ("out_path", "Path for output .npz file"),
    ]:
        parser.add_argument(arg_name, type=str, help=arg_help)
    parser.add_argument(
        "--template_npz_path",
        type=str,
        default="openfold/resources/params/params_model_1_ptm.npz",
        help="""Path to an AlphaFold checkpoint w/ a superset of the OF
            checkpoint's parameters. params_model_1_ptm.npz always works.
        """
    )
    main(parser.parse_args())
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import json
import os
import re
import requests
from openfold.data import mmcif_parsing
# Time windows accepted by the CAMEO "targets" endpoint (used both to build
# the request URL and to validate the CLI argument).
VALID_PERIODS = [
    "1-year",
    "6-months",
    "3-months",
    "1-month",
    "1-week",
]
def generate_url(period, end_date):
    """Build the CAMEO AJAX URL listing targets for `period` ending at
    `end_date` (YYYY-MM-DD).

    NOTE: the base segment already ends with '/', so the joined URL contains
    a double slash after the host — preserved since the endpoint accepts it.
    """
    segments = (
        "https://www.cameo3d.org/",
        "modeling",
        "targets",
        period,
        "ajax",
        f"?to_date={end_date}",
    )
    return "/".join(segments)
def main(args):
    """Download CAMEO target chains for the requested period.

    Writes one FASTA per chain to <output_dir>/fasta_dir and the matching
    mmCIF files to <output_dir>/data_dir.

    Raises:
        The first mmCIF parsing error if a downloaded structure cannot be
        parsed.
    """
    data_dir_path = os.path.join(args.output_dir, "data_dir")
    fasta_dir_path = os.path.join(args.output_dir, "fasta_dir")
    os.makedirs(data_dir_path, exist_ok=True)
    os.makedirs(fasta_dir_path, exist_ok=True)

    # Fetch the list of CAMEO targets for the period.
    url = generate_url(args.period, args.end_date)
    raw_data = requests.get(url).text
    parsed_data = json.loads(raw_data)
    chain_data = parsed_data["aaData"]

    for chain in chain_data:
        pdb_id = chain["pdbid"]
        chain_id = chain["pdbid_chain"]
        # Download the full mmCIF for this entry from RCSB.
        pdb_url = f"https://files.rcsb.org/view/{pdb_id.upper()}.cif"
        pdb_file = requests.get(pdb_url).text
        parsed_cif = mmcif_parsing.parse(
            file_id=pdb_id, mmcif_string=pdb_file
        )
        mmcif_object = parsed_cif.mmcif_object
        if mmcif_object is None:
            raise list(parsed_cif.errors.values())[0]

        seq = mmcif_object.chain_to_seqres[chain_id]
        # BUG FIX: the original compared len(seq) > len(seq), which is always
        # False, so --max_seqlen was never enforced. Compare against the flag.
        if args.max_seqlen > 0 and len(seq) > args.max_seqlen:
            continue

        # Write the chain's sequence as a single-record FASTA.
        fasta_file = '\n'.join([
            f">{pdb_id}_{chain_id}",
            seq,
        ])
        fasta_filename = f"{pdb_id}_{chain_id}.fasta"
        with open(os.path.join(fasta_dir_path, fasta_filename), "w") as fp:
            fp.write(fasta_file)

        # Keep the raw mmCIF alongside (one per PDB entry).
        cif_filename = f"{pdb_id}.cif"
        with open(os.path.join(data_dir_path, cif_filename), "w") as fp:
            fp.write(pdb_file)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "period", type=str,
        help=f"""The length of the period from which to draw CAMEO proteins.
        Choose from {VALID_PERIODS}"""
    )
    parser.add_argument(
        "end_date", type=str,
        help="The date marking the end of the period (YYYY-MM-DD)"
    )
    parser.add_argument("output_dir")
    parser.add_argument(
        "--max_seqlen", type=int, default=700,
        help="The maximum length in residues of downloaded proteins (or -1)"
    )
    args = parser.parse_args()

    # Validate inputs up front so we fail before any network traffic.
    if args.period not in VALID_PERIODS:
        raise ValueError(f"Invalid period. Choose from {VALID_PERIODS}")
    if re.fullmatch(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", args.end_date) is None:
        raise ValueError(f"Invalid end_date: {args.end_date}. Use YYYY-MM-DD format")

    main(args)
......@@ -35,4 +35,4 @@ SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/colabfold/colabfold_envdb_202108.ta
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4 --check-certificate=false
......@@ -38,6 +38,4 @@ BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
pushd "${ROOT_DIR}"
gunzip "${ROOT_DIR}/${BASENAME}"
popd
......@@ -14,9 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips all required data for AlphaFold.
# Downloads OpenFold parameters.
#
# Usage: bash download_all_data.sh /path/to/download/directory
# Usage: bash download_openfold_params_huggingface.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
......@@ -24,28 +24,11 @@ if [[ $# -eq 0 ]]; then
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
if ! command -v aws &> /dev/null ; then
echo "Error: aws could not be found. Please install aws."
exit 1
fi
DOWNLOAD_DIR="$1"
DOWNLOAD_MODE="${2:-full_dbs}" # Default mode to full_dbs.
if [[ "${DOWNLOAD_MODE}" != full_dbs && "${DOWNLOAD_MODE}" != reduced_dbs ]]
then
echo "DOWNLOAD_MODE ${DOWNLOAD_MODE} not recognized."
exit 1
fi
SCRIPT_DIR="$(dirname "$(realpath "$0")")"
echo "Downloading AlphaFold parameters..."
bash "${SCRIPT_DIR}/download_alphafold_params.sh" "${DOWNLOAD_DIR}"
echo "Downloading PDB70..."
bash "${SCRIPT_DIR}/download_pdb70.sh" "${DOWNLOAD_DIR}"
echo "Downloading PDB mmCIF files..."
bash "${SCRIPT_DIR}/download_pdb_mmcif.sh" "${DOWNLOAD_DIR}"
echo "All data downloaded."
DOWNLOAD_DIR="${1}/openfold_params"
mkdir -p "${DOWNLOAD_DIR}"
aws s3 cp --no-sign-request --region us-east-1 s3://openfold/openfold_params/ "${DOWNLOAD_DIR}" --recursive
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips OpenFold parameters from Google Drive. Alternative to
# the HuggingFace version.
#
# Usage: bash download_openfold_params_gdrive.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
FILE_ID="1GVzZA2nbdBbz6TKydvzquhfELJ3Movnb"
FILENAME="openfold_params_07_22.tar.gz"
# Download a publicly shared Google Drive file, working around the
# "can't scan for viruses" confirmation page served for large files.
#   $1: Google Drive file ID
#   $2: output directory
# Echoes the path of the downloaded file on stdout.
download_from_gdrive() {
  FILE_ID="$1"
  OUT_DIR="$2"
  # First request: fetch the interstitial page (saving session cookies) so
  # we can scrape the confirmation token and the real filename from it.
  MSG=$(wget \
    --quiet \
    --save-cookies /tmp/cookies_$$.txt \
    --keep-session-cookies \
    --no-check-certificate \
    "https://docs.google.com/uc?export=download&id=${FILE_ID}" \
    -O- \
  )
  # Confirmation token embedded in the page as "confirm=<token>".
  CONFIRM=$(echo $MSG | sed -rn "s/.*confirm=([0-9A-Za-z_]+).*/\1\n/p")
  # Filename scraped from the page's anchor tag for this file ID.
  FILENAME=$(echo $MSG | sed -e "s/.*<a href=\"\/open?id=${FILE_ID}\">\(.*\)<\/a> (.*/\1/")
  FILEPATH="${OUT_DIR}/${FILENAME}"
  # Second request: replay the cookies with the token to get the payload.
  wget \
    --quiet \
    --load-cookies /tmp/cookies_$$.txt \
    "https://docs.google.com/uc?export=download&confirm=${CONFIRM}&id=${FILE_ID}" \
    -O "${FILEPATH}"
  rm /tmp/cookies_$$.txt
  echo $FILEPATH
}
DOWNLOAD_DIR="$1"
mkdir -p "${DOWNLOAD_DIR}"

# Download the tarball and note where it landed.
DOWNLOAD_PATH=$(download_from_gdrive "${FILE_ID}" "${DOWNLOAD_DIR}")
DOWNLOAD_FILENAME=$(basename "${DOWNLOAD_PATH}")

# Sanity-check that Drive served the file we expected — the filename scrape
# can silently return an error page's title instead.
if [[ $FILENAME != $DOWNLOAD_FILENAME ]]; then
  echo "Error: Downloaded filename ${DOWNLOAD_FILENAME} does not match expected filename ${FILENAME}"
  rm "${DOWNLOAD_PATH}"
  # BUG FIX: a bare "exit" here returned status 0, so callers saw success.
  exit 1
fi

tar --extract --verbose --file="${DOWNLOAD_PATH}" \
  --directory="${DOWNLOAD_DIR}" --preserve-permissions
rm "${DOWNLOAD_PATH}"
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips OpenFold parameters.
#
# Usage: bash download_openfold_params_huggingface.sh /path/to/download/directory
set -e

if [[ $# -eq 0 ]]; then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

URL="https://huggingface.co/nz/OpenFold"
DOWNLOAD_DIR="${1}/openfold_params/"
mkdir -p "${DOWNLOAD_DIR}"
# Clone the HuggingFace repo holding the weights, then drop the git
# metadata — we only want the parameter files, not a working checkout.
# (Quoted "$URL" to avoid word-splitting; was unquoted.)
git clone "${URL}" "${DOWNLOAD_DIR}"
rm -rf "${DOWNLOAD_DIR}/.git"
......@@ -35,7 +35,7 @@ SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" --check-certificate=false
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
--directory="${ROOT_DIR}"
rm "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Copyright 2021 AlQuraishi Laboratories
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads .cif files matching the RODA alignments. Outputs a list of
# RODA alignments for which .cif files could not be found.
if [[ $# != 2 ]]; then
    echo "usage: ./download_roda_pdbs.sh <out_dir> <roda_pdb_alignment_dir>"
    exit 1
fi

OUT_DIR=$1
RODA_ALIGNMENT_DIR=$2

# Refuse to clobber an existing download.
if [[ -d $OUT_DIR ]]; then
    echo "${OUT_DIR} already exists. Download failed..."
    exit 1
fi

SERVER=snapshotrsync.rcsb.org # RCSB server name
PORT=873 # port RCSB server is using

# Mirror the divided mmCIF tree. "2>&1 > /dev/null" keeps stderr visible on
# the terminal while discarding the verbose stdout file listing.
rsync -rlpt -v -z --delete --port=$PORT $SERVER::20220103/pub/pdb/data/structures/divided/mmCIF/ "$OUT_DIR" 2>&1 > /dev/null

# Flatten: move every file out of the two-letter subdirectories and
# decompress it. (while-read replaces for-$(find), and all expansions are
# quoted, so the loop no longer breaks on unusual filenames.)
find "$OUT_DIR" -mindepth 2 -type f | while read -r f; do
    mv "$f" "$OUT_DIR"
    BASENAME=$(basename "$f")
    gunzip "${OUT_DIR}/${BASENAME}"
done

# Remove the now-empty subdirectories and any symlinks (GNU find syntax).
find "$OUT_DIR" -mindepth 1 -type d,l -delete

# Print the alignment dirs for which no matching .cif was downloaded.
find "$RODA_ALIGNMENT_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r d; do
    BASENAME=$(basename "$d")
    PDB_ID=$(echo "$BASENAME" | cut -d '_' -f 1)
    CIF_PATH="${OUT_DIR}/${PDB_ID}.cif"
    if [[ ! -f $CIF_PATH ]]; then
        echo "$d"
    fi
done
......@@ -35,4 +35,5 @@ SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4 --check-certificate=false
gunzip "${ROOT_DIR}/${BASENAME}"
......@@ -36,6 +36,5 @@ BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
pushd "${ROOT_DIR}"
gunzip "${ROOT_DIR}/${BASENAME}"
popd
#!/usr/bin/env bash
# BUG FIX: shebang was "sh" but the script uses [[ ]], a bashism that fails
# under POSIX shells such as dash — run it with bash.
#
# Flattens a downloaded RODA database into the format expected by OpenFold
# Args:
#     roda_dir:
#         The path to the database you want to flatten. E.g. "roda/pdb"
#         or "roda/uniclust30". Note that, to save space, this script
#         will empty this directory.
#     output_dir:
#         The directory in which to construct the reformatted data
if [[ $# != 2 ]]; then
    echo "usage: ./flatten_roda.sh <roda_dir> <output_dir>"
    exit 1
fi

RODA_DIR=$1
OUTPUT_DIR=$2
DATA_DIR="${OUTPUT_DIR}/data"
ALIGNMENT_DIR="${OUTPUT_DIR}/alignments"

mkdir -p "${DATA_DIR}"
mkdir -p "${ALIGNMENT_DIR}"

# One subdirectory per chain; "pdb"/"cif" subdirs hold structure files,
# everything else is alignments. (Globs replace $(ls ...) so names with
# whitespace don't word-split.)
for chain_dir_path in "${RODA_DIR}"/*; do
    chain_dir=$(basename "${chain_dir_path}")
    CHAIN_DIR_PATH="${RODA_DIR}/${chain_dir}"
    for subdir_path in "${CHAIN_DIR_PATH}"/*; do
        subdir=$(basename "${subdir_path}")
        if [[ $subdir = "pdb" ]] || [[ $subdir = "cif" ]]; then
            mv "${CHAIN_DIR_PATH}/${subdir}"/* "${DATA_DIR}"
        else
            CHAIN_ALIGNMENT_DIR="${ALIGNMENT_DIR}/${chain_dir}"
            mkdir -p "${CHAIN_ALIGNMENT_DIR}"
            mv "${CHAIN_DIR_PATH}/${subdir}"/* "${CHAIN_ALIGNMENT_DIR}"
        fi
    done
done

# If no structure files were found, remove the empty data dir so callers
# can tell nothing was produced. (Quoted: rm -rf on an unquoted var is
# dangerous if it is ever empty or contains spaces.)
NO_DATA_FILES=$(find "${DATA_DIR}" -type f | wc -l)
if [[ $NO_DATA_FILES = 0 ]]; then
    rm -rf "${DATA_DIR}"
fi
......@@ -2,35 +2,62 @@ import argparse
import os
import pickle
from alphafold.data import pipeline, templates
from alphafold.data import pipeline, pipeline_multimer, templates
from alphafold.data.tools import hmmsearch, hhsearch
from scripts.utils import add_data_args
def main(args):
template_featurizer = templates.TemplateHitFeaturizer(
mmcif_dir=args.mmcif_dir,
max_template_date=args.max_template_date,
max_hits=20,
kalign_binary_path=args.kalign_binary_path,
release_dates_path=None,
obsolete_pdbs_path=args.obsolete_pdbs_path,
)
if (args.multimer):
template_searcher = hmmsearch.Hmmsearch(
binary_path=args.hmmsearch_binary_path,
hmmbuild_binary_path=args.hmmbuild_binary_path,
database_path=args.pdb_seqres_database_path,
)
template_featurizer = templates.HmmsearchHitFeaturizer(
mmcif_dir=args.template_mmcif_dir,
max_template_date=args.max_template_date,
max_hits=20,
kalign_binary_path=args.kalign_binary_path,
release_dates_path=args.release_dates_path,
obsolete_pdbs_path=args.obsolete_pdbs_path
)
else:
template_searcher = hhsearch.HHSearch(
binary_path=args.hhsearch_binary_path,
databases=[args.pdb70_database_path],
)
template_featurizer = templates.HhsearchHitFeaturizer(
mmcif_dir=args.template_mmcif_dir,
max_template_date=args.max_template_date,
max_hits=20,
kalign_binary_path=args.kalign_binary_path,
release_dates_path=None,
obsolete_pdbs_path=args.obsolete_pdbs_path
)
data_pipeline = pipeline.DataPipeline(
jackhmmer_binary_path=args.jackhmmer_binary_path,
hhblits_binary_path=args.hhblits_binary_path,
hhsearch_binary_path=args.hhsearch_binary_path,
uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path,
uniclust30_database_path=args.uniclust30_database_path,
pdb70_database_path=args.pdb70_database_path,
small_bfd_database_path=None,
template_featurizer=template_featurizer,
template_searcher=template_searcher,
use_small_bfd=False,
)
if (args.multimer):
data_pipeline = pipeline_multimer.DataPipeline(
monomer_data_pipeline=data_pipeline,
jackhmmer_binary_path=args.jackhmmer_binary_path,
uniprot_database_path=args.uniprot_database_path)
feature_dict = data_pipeline.process(
input_fasta_path=args.fasta_path,
msa_output_dir=args.output_dir,
......@@ -44,6 +71,7 @@ if __name__ == "__main__":
parser.add_argument("fasta_path", type=str)
parser.add_argument("mmcif_dir", type=str)
parser.add_argument("output_dir", type=str)
parser.add_argument("--multimer", action='store_true')
add_data_args(parser)
args = parser.parse_args()
......
......@@ -54,9 +54,8 @@ def parse_file(
chain_dict["seq"] = residue_constants.aatype_to_str_sequence(
protein_object.aatype,
)
local_data["resolution"] = 0.
cluster_size = chain_cluster_size_dict.get(file_id.upper(), -1)
chain_dict["resolution"] = 0.
if(chain_cluster_size_dict is not None):
cluster_size = chain_cluster_size_dict.get(
full_name.upper(), -1
......
#!/bin/bash
git clone --branch v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite
&& mkdir /tmp/hh-suite/build
&& pushd /tmp/hh-suite/build
&& cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite ..
&& make -j 4 && make install
&& ln -s /opt/hhsuite/bin/* /usr/bin
&& popd
&& rm -rf /tmp/hh-suite
git clone --branch v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite \
&& mkdir /tmp/hh-suite/build \
&& pushd /tmp/hh-suite/build \
&& cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. \
&& make -j 4 && make install \
&& ln -sf /opt/hhsuite/bin/* /usr/bin \
&& popd \
&& rm -rf /tmp/hh-suite
......@@ -15,7 +15,15 @@ wget -P /tmp \
export PATH=lib/conda/bin:$PATH
lib/conda/bin/python3 -m pip install nvidia-pyindex
conda env create --name=${ENV_NAME} -f environment.yml
source activate ${ENV_NAME}
source scripts/activate_conda_env.sh
echo "Attempting to install FlashAttention"
git clone https://github.com/HazyResearch/flash-attention
CUR_DIR=$PWD
cd flash-attention
git checkout 5b838a8bef
python3 setup.py install
cd $CUR_DIR
# Install DeepMind's OpenMM patch
OPENFOLD_DIR=$PWD
......@@ -24,15 +32,18 @@ pushd lib/conda/envs/$ENV_NAME/lib/python3.7/site-packages/ \
&& popd
# Download folding resources
wget -q -P openfold/resources \
wget --no-check-certificate -P openfold/resources \
https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt
# Certain tests need access to this file
mkdir -p tests/test_data/alphafold/common
ln -rs openfold/resources/stereo_chemical_props.txt tests/test_data/alphafold/common
# Download pretrained openfold weights
scripts/download_alphafold_params.sh openfold/resources
echo "Downloading OpenFold parameters..."
bash scripts/download_openfold_params.sh openfold/resources
echo "Downloading AlphaFold parameters..."
bash scripts/download_alphafold_params.sh openfold/resources
# Decompress test data
gunzip tests/test_data/sample_feats.pickle.gz
......@@ -227,7 +227,7 @@ if __name__ == "__main__":
)
add_data_args(parser)
parser.add_argument(
"--raise_errors", type=bool, default=False,
"--raise_errors", action="store_true", default=False,
help="Whether to crash on parsing errors"
)
parser.add_argument(
......
......@@ -23,12 +23,12 @@ DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/mmseqs_dbs"
mkdir -p $ROOT_DIR
for f in $(ls ${DOWNLOAD_DIR}/*.tar.gz)
for f in $(ls ${DOWNLOAD_DIR}/*.tar*)
do
tar --extract --verbose --file="${f}" \
--directory=$ROOT_DIR
rm "${f}"
BASENAME="$(basename {f%%.*})"
BASENAME="$(basename ${f%%.*})"
DB_NAME="${BASENAME}_db"
OLD_PWD=$(pwd)
cd $ROOT_DIR
......
#!/bin/bash
# Generates uniclust30 all-against-all alignments on a SLURM cluster.
# Thanks to Milot Mirdita for help & feedback on this script.
set -e

if [[ $# != 3 ]]; then
    echo "usage: ./run_uniclust30_search.sh <uniclust30_path> <scratch_dir> <out_dir>"
    # BUG FIX: bare "exit" returned status 0 on a usage error.
    exit 1
fi

UNICLUST_PATH=$1
SCRATCH_DIR_BN=$2
OUT_DIR=$3
CPUS_PER_TASK=4          # hhblits threads per task
MAX_SIZE=10000000000     # 10GB
# Per-node scratch directory (suffix makes it unique per SLURM node).
SCRATCH_DIR="${SCRATCH_DIR_BN}_${SLURM_NODEID}"

mkdir -p "${SCRATCH_DIR}"
mkdir -p "${OUT_DIR}"

# copy database to local ssd (tmpfs) for fast random access
DB_BN=$(basename "$UNICLUST_PATH")
DB_DIR="/dev/shm/uniclust30"
mkdir -p "$DB_DIR"
cp "${UNICLUST_PATH}"*.ff* "$DB_DIR"
DB="${DB_DIR}/${DB_BN}"

# Collect keys already present in previous output zips so they are not
# searched again on restart.
for f in "$OUT_DIR"/*.zip
do
    zipinfo -1 "$f" '*/' | awk -F/ '{print $(NF-1)}' >> "${DB_DIR}/already_searched.txt"
done
python3 filter_ffindex.py "${DB}_a3m.ffindex" "${DB_DIR}/already_searched.txt" "${DB_DIR}/filtered_a3m.ffindex"

# Shard the filtered index across SLURM nodes; this node processes slice
# (SLURM_NODEID + 1) of SLURM_JOB_NUM_NODES.
TARGET="${DB}_a3m_${SLURM_NODEID}.ffindex"
split -n "l/$((SLURM_NODEID + 1))/${SLURM_JOB_NUM_NODES}" "${DB_DIR}/filtered_a3m.ffindex" > "$TARGET"
# Initialize a counting semaphore with $1 slots, implemented as an
# anonymous FIFO on fd 3. The FIFO is unlinked immediately after being
# opened, so it disappears automatically when the script exits.
open_sem() {
    mkfifo pipe-$$
    exec 3<>pipe-$$
    rm pipe-$$
    local i=$1
    # Seed the pipe with N tokens; each token is a 3-digit exit status
    # ("000" = success) matching what run_with_lock pushes back.
    for ((;i>0;i--)); do
        printf %s 000 >&3
    done
}
# run the given command asynchronously and pop/push tokens
# Blocks until a semaphore token is available on fd 3, then runs "$@" in a
# background subshell. Exits the script if a previously completed task
# pushed a non-zero status token.
run_with_lock() {
    local x
    # this read waits until there is something to read
    read -u 3 -n 3 x && ((0==x)) || exit $x
    (
        ( "$@"; )
        # push the return code of the command to the semaphore
        printf '%.3d' $? >&3
    )&
}
# Search one a3m record against the local uniclust30 DB with hhblits.
# Args: $1 = ffindex KEY, $2 = byte OFFset, $3 = record LENgth.
# BUG FIX: the caller passes KEY/OFF/LEN as arguments, but the function
# previously ignored them and read the caller's globals; bind the
# positional parameters explicitly so the function is self-contained.
task() {
    local KEY="$1"
    local OFF="$2"
    local LEN="$3"
    # Extract exactly this record from the packed ffdata and pipe it in.
    dd if="${DB}_a3m.ffdata" ibs=1 skip="${OFF}" count="${LEN}" status=none | \
        hhblits -i stdin \
            -oa3m "${SCRATCH_DIR}/${KEY}/uniclust30.a3m" \
            -v 0 \
            -o /dev/null \
            -cpu $CPUS_PER_TASK \
            -d $DB \
            -n 3 \
            -e 0.001
}
# Archive the scratch directory into a randomly named zip in OUT_DIR and
# clear the scratch space.
# NOTE(review): the size-threshold condition is commented out and replaced
# by an always-true test, so this zips on every call — confirm whether that
# is intentional. Also, `du -hbs` combines -h (human-readable) with -b
# (bytes); the numeric -gt comparison against MAX_SIZE would not work on
# -h style output. Verify before re-enabling the threshold.
zip_or_not() {
    SIZE=$(du -hbs $SCRATCH_DIR | sed 's/|/ /' | awk '{print $1}')
    #if [[ "$SIZE" -gt "$MAX_SIZE" ]]
    if [[ "2" -gt "1" ]]
    then
        # Wait for in-flight tasks so the zip only captures complete files.
        wait
        RANDOM_NAME=$(cat /dev/urandom | tr -cd 'a-f0-9' | head -c 32)
        zip -r "${OUT_DIR}/${RANDOM_NAME}.zip" $SCRATCH_DIR
        find $SCRATCH_DIR -mindepth 1 -type d -exec rm -rf {} +
    fi
}
# Run one concurrent task per CPUS_PER_TASK-sized slice of the machine.
N=$(($(nproc) / ${CPUS_PER_TASK}))
open_sem $N
# Each ffindex line is "KEY OFFSET LENGTH".
while read -r KEY OFF LEN; do
    PROT_DIR="${SCRATCH_DIR}/${KEY}"
    # Skip entries already processed into this scratch space.
    if [[ -d $PROT_DIR ]]
    then
        continue
    fi
    mkdir -p $PROT_DIR
    run_with_lock task "${KEY}" "${OFF}" "${LEN}"
    zip_or_not
done < $TARGET
wait
# Archive whatever remains after the final batch.
zip_or_not
wait
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment