Commit 39a6d0e6 authored by Christina Floristean's avatar Christina Floristean
Browse files

Merging in main branch

parents d8ee9c5f 84659c93
#!/bin/bash -e
# Copied from colabfold.mmseqs.com
#!/bin/bash -e
MMSEQS="$1"
QUERY="$2"
DBBASE="$3"
......
# Copyright 2022 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Converts OpenFold .pt checkpoints into AlphaFold .npz ones, which can then be
# used to run inference using DeepMind's JAX code.
import argparse
import numpy as np
import torch
from openfold.config import model_config
from openfold.model.model import AlphaFold
from openfold.utils.import_weights import (
Param,
ParamType,
generate_translation_dict,
process_translation_dict,
)
from openfold.utils.tensor_utils import tree_map
def reshape_fn(of_param, af_weight):
    """Convert an OpenFold parameter to the layout of its AlphaFold
    counterpart `af_weight` (transposing Linear weights and flattening
    MHA/OPM weights as required).
    """
    def _t(w):
        # Linear weights are stored transposed relative to AlphaFold.
        return w.transpose(-1, -2)

    def _r(w):
        # Collapse to the exact shape of the target AlphaFold weight.
        return w.reshape(af_weight.shape)

    transformations = {
        ParamType.LinearWeight: _t,
        ParamType.LinearWeightMHA: lambda w: _r(_t(w)),
        ParamType.LinearMHAOutputWeight: lambda w: _r(_t(w)),
        ParamType.LinearBiasMHA: _r,
        ParamType.LinearWeightOPM: lambda w: _r(_t(w)),
        ParamType.Other: lambda w: w,
    }

    # Stacked params hold one tensor per block; materialize them as one
    # stacked tensor so a single transformation covers all blocks.
    if of_param.stacked:
        of_weight = torch.stack([torch.Tensor(p) for p in of_param.param])
    else:
        of_weight = torch.Tensor(of_param.param)

    return transformations[of_param.param_type](of_weight)
def transfer(of_dict, af_weight_template):
    """Recursively copy OpenFold params into the AlphaFold weight template.

    `af_weight_template` is modified in place: each leaf numpy array is
    overwritten with the reshaped OpenFold weight of the same key.
    """
    for key, value in of_dict.items():
        target = af_weight_template[key]
        if type(value) == dict:
            # Nested module scope: descend into the matching sub-template.
            transfer(value, target)
        else:
            # Leaf parameter: convert to the AF layout, then overwrite the
            # template buffer in place.
            reshaped = reshape_fn(value, target).detach().numpy()
            np.copyto(target, reshaped)
def main(args):
    """Convert an OpenFold .pt checkpoint into an AlphaFold-style .npz file."""
    # Load the OpenFold checkpoint (a state dict saved with torch.save).
    d = torch.load(args.of_pt_path)
    # Instantiate the model for this preset so the translation dict can be
    # generated from its parameter structure.
    config = model_config(args.config_preset)
    model = AlphaFold(config)
    model.load_state_dict(d)
    # Build the OpenFold-name -> AlphaFold-name parameter translation.
    translation = generate_translation_dict(model, args.config_preset)
    translation = process_translation_dict(translation)
    # Use an existing AlphaFold .npz as a template, keeping only the entries
    # we can actually fill from the OpenFold checkpoint.
    af_weight_template = np.load(args.template_npz_path)
    af_weight_template = {k:v for k,v in af_weight_template.items() if k in translation}
    # Zero the template first so any parameter left unfilled is conspicuous.
    zero = lambda n: n * 0
    af_weight_template = tree_map(zero, af_weight_template, np.ndarray)
    # Copy the translated OpenFold weights into the template and save it.
    transfer(translation, af_weight_template)
    np.savez(args.out_path, **af_weight_template)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required positional arguments, in CLI order.
    for arg_name, arg_help in [
        ("of_pt_path", "Path to OpenFold .pt checkpoint file"),
        ("config_preset", "The corresponding config preset"),
        ("out_path", "Path for output .npz file"),
    ]:
        parser.add_argument(arg_name, type=str, help=arg_help)
    parser.add_argument(
        "--template_npz_path",
        type=str,
        default="openfold/resources/params/params_model_1_ptm.npz",
        help="""Path to an AlphaFold checkpoint w/ a superset of the OF
            checkpoint's parameters. params_model_1_ptm.npz always works.
        """
    )
    main(parser.parse_args())
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import json
import os
import re
import requests
from openfold.data import mmcif_parsing
# Time windows accepted by the CAMEO "targets" endpoint (used both to build
# the request URL and to validate the CLI argument).
VALID_PERIODS = [
    "1-year",
    "6-months",
    "3-months",
    "1-month",
    "1-week",
]
def generate_url(period, end_date):
    """Build the CAMEO AJAX URL listing targets for `period` ending at
    `end_date` (YYYY-MM-DD).

    NOTE: the base segment already ends with '/', so the joined URL contains
    a double slash after the host — preserved since the endpoint accepts it.
    """
    segments = (
        "https://www.cameo3d.org/",
        "modeling",
        "targets",
        period,
        "ajax",
        f"?to_date={end_date}",
    )
    return "/".join(segments)
def main(args):
    """Download CAMEO target chains for the requested period.

    Writes one FASTA per chain to <output_dir>/fasta_dir and the matching
    mmCIF files to <output_dir>/data_dir.

    Raises:
        The first mmCIF parsing error if a downloaded structure cannot be
        parsed.
    """
    data_dir_path = os.path.join(args.output_dir, "data_dir")
    fasta_dir_path = os.path.join(args.output_dir, "fasta_dir")
    os.makedirs(data_dir_path, exist_ok=True)
    os.makedirs(fasta_dir_path, exist_ok=True)

    # Fetch the list of CAMEO targets for the period.
    url = generate_url(args.period, args.end_date)
    raw_data = requests.get(url).text
    parsed_data = json.loads(raw_data)
    chain_data = parsed_data["aaData"]

    for chain in chain_data:
        pdb_id = chain["pdbid"]
        chain_id = chain["pdbid_chain"]
        # Download the full mmCIF for this entry from RCSB.
        pdb_url = f"https://files.rcsb.org/view/{pdb_id.upper()}.cif"
        pdb_file = requests.get(pdb_url).text
        parsed_cif = mmcif_parsing.parse(
            file_id=pdb_id, mmcif_string=pdb_file
        )
        mmcif_object = parsed_cif.mmcif_object
        if mmcif_object is None:
            raise list(parsed_cif.errors.values())[0]

        seq = mmcif_object.chain_to_seqres[chain_id]
        # BUG FIX: the original compared len(seq) > len(seq), which is always
        # False, so --max_seqlen was never enforced. Compare against the flag.
        if args.max_seqlen > 0 and len(seq) > args.max_seqlen:
            continue

        # Write the chain's sequence as a single-record FASTA.
        fasta_file = '\n'.join([
            f">{pdb_id}_{chain_id}",
            seq,
        ])
        fasta_filename = f"{pdb_id}_{chain_id}.fasta"
        with open(os.path.join(fasta_dir_path, fasta_filename), "w") as fp:
            fp.write(fasta_file)

        # Keep the raw mmCIF alongside (one per PDB entry).
        cif_filename = f"{pdb_id}.cif"
        with open(os.path.join(data_dir_path, cif_filename), "w") as fp:
            fp.write(pdb_file)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "period", type=str,
        help=f"""The length of the period from which to draw CAMEO proteins.
        Choose from {VALID_PERIODS}"""
    )
    parser.add_argument(
        "end_date", type=str,
        help="The date marking the end of the period (YYYY-MM-DD)"
    )
    parser.add_argument("output_dir")
    parser.add_argument(
        "--max_seqlen", type=int, default=700,
        help="The maximum length in residues of downloaded proteins (or -1)"
    )
    args = parser.parse_args()

    # Validate inputs up front so we fail before any network traffic.
    if args.period not in VALID_PERIODS:
        raise ValueError(f"Invalid period. Choose from {VALID_PERIODS}")
    if re.fullmatch(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", args.end_date) is None:
        raise ValueError(f"Invalid end_date: {args.end_date}. Use YYYY-MM-DD format")

    main(args)
......@@ -35,4 +35,4 @@ SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/colabfold/colabfold_envdb_202108.ta
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4 --check-certificate=false
......@@ -38,6 +38,4 @@ BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
pushd "${ROOT_DIR}"
gunzip "${ROOT_DIR}/${BASENAME}"
popd
......@@ -14,9 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips all required data for AlphaFold.
# Downloads OpenFold parameters.
#
# Usage: bash download_all_data.sh /path/to/download/directory
# Usage: bash download_openfold_params_huggingface.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
......@@ -24,28 +24,11 @@ if [[ $# -eq 0 ]]; then
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
if ! command -v aws &> /dev/null ; then
echo "Error: aws could not be found. Please install aws."
exit 1
fi
DOWNLOAD_DIR="$1"
DOWNLOAD_MODE="${2:-full_dbs}" # Default mode to full_dbs.
if [[ "${DOWNLOAD_MODE}" != full_dbs && "${DOWNLOAD_MODE}" != reduced_dbs ]]
then
echo "DOWNLOAD_MODE ${DOWNLOAD_MODE} not recognized."
exit 1
fi
SCRIPT_DIR="$(dirname "$(realpath "$0")")"
echo "Downloading AlphaFold parameters..."
bash "${SCRIPT_DIR}/download_alphafold_params.sh" "${DOWNLOAD_DIR}"
echo "Downloading PDB70..."
bash "${SCRIPT_DIR}/download_pdb70.sh" "${DOWNLOAD_DIR}"
echo "Downloading PDB mmCIF files..."
bash "${SCRIPT_DIR}/download_pdb_mmcif.sh" "${DOWNLOAD_DIR}"
echo "All data downloaded."
DOWNLOAD_DIR="${1}/openfold_params"
mkdir -p "${DOWNLOAD_DIR}"
aws s3 cp --no-sign-request --region us-east-1 s3://openfold/openfold_params/ "${DOWNLOAD_DIR}" --recursive
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips OpenFold parameters from Google Drive. Alternative to
# the HuggingFace version.
#
# Usage: bash download_openfold_params_gdrive.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
FILE_ID="1GVzZA2nbdBbz6TKydvzquhfELJ3Movnb"
FILENAME="openfold_params_07_22.tar.gz"
# Download a publicly shared Google Drive file, working around the
# "can't scan for viruses" confirmation page served for large files.
#   $1: Google Drive file ID
#   $2: output directory
# Echoes the path of the downloaded file on stdout.
download_from_gdrive() {
  FILE_ID="$1"
  OUT_DIR="$2"
  # First request: fetch the interstitial page (saving session cookies) so
  # we can scrape the confirmation token and the real filename from it.
  MSG=$(wget \
    --quiet \
    --save-cookies /tmp/cookies_$$.txt \
    --keep-session-cookies \
    --no-check-certificate \
    "https://docs.google.com/uc?export=download&id=${FILE_ID}" \
    -O- \
  )
  # Confirmation token embedded in the page as "confirm=<token>".
  CONFIRM=$(echo $MSG | sed -rn "s/.*confirm=([0-9A-Za-z_]+).*/\1\n/p")
  # Filename scraped from the page's anchor tag for this file ID.
  FILENAME=$(echo $MSG | sed -e "s/.*<a href=\"\/open?id=${FILE_ID}\">\(.*\)<\/a> (.*/\1/")
  FILEPATH="${OUT_DIR}/${FILENAME}"
  # Second request: replay the cookies with the token to get the payload.
  wget \
    --quiet \
    --load-cookies /tmp/cookies_$$.txt \
    "https://docs.google.com/uc?export=download&confirm=${CONFIRM}&id=${FILE_ID}" \
    -O "${FILEPATH}"
  rm /tmp/cookies_$$.txt
  echo $FILEPATH
}
DOWNLOAD_DIR="$1"
mkdir -p "${DOWNLOAD_DIR}"

# Download the tarball and note where it landed.
DOWNLOAD_PATH=$(download_from_gdrive "${FILE_ID}" "${DOWNLOAD_DIR}")
DOWNLOAD_FILENAME=$(basename "${DOWNLOAD_PATH}")

# Sanity-check that Drive served the file we expected — the filename scrape
# can silently return an error page's title instead.
if [[ $FILENAME != $DOWNLOAD_FILENAME ]]; then
  echo "Error: Downloaded filename ${DOWNLOAD_FILENAME} does not match expected filename ${FILENAME}"
  rm "${DOWNLOAD_PATH}"
  # BUG FIX: a bare "exit" here returned status 0, so callers saw success.
  exit 1
fi

tar --extract --verbose --file="${DOWNLOAD_PATH}" \
  --directory="${DOWNLOAD_DIR}" --preserve-permissions
rm "${DOWNLOAD_PATH}"
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips OpenFold parameters.
#
# Usage: bash download_openfold_params_huggingface.sh /path/to/download/directory
set -e

if [[ $# -eq 0 ]]; then
    echo "Error: download directory must be provided as an input argument."
    exit 1
fi

URL="https://huggingface.co/nz/OpenFold"
DOWNLOAD_DIR="${1}/openfold_params/"
mkdir -p "${DOWNLOAD_DIR}"
# Clone the HuggingFace repo holding the weights, then drop the git
# metadata — we only want the parameter files, not a working checkout.
# (Quoted "$URL" to avoid word-splitting; was unquoted.)
git clone "${URL}" "${DOWNLOAD_DIR}"
rm -rf "${DOWNLOAD_DIR}/.git"
......@@ -35,7 +35,7 @@ SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" --check-certificate=false
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
--directory="${ROOT_DIR}"
rm "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Copyright 2021 AlQuraishi Laboratories
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads .cif files matching the RODA alignments. Outputs a list of
# RODA alignments for which .cif files could not be found.
if [[ $# != 2 ]]; then
    echo "usage: ./download_roda_pdbs.sh <out_dir> <roda_pdb_alignment_dir>"
    exit 1
fi

OUT_DIR=$1
RODA_ALIGNMENT_DIR=$2

# Refuse to clobber an existing download.
if [[ -d $OUT_DIR ]]; then
    echo "${OUT_DIR} already exists. Download failed..."
    exit 1
fi

SERVER=snapshotrsync.rcsb.org # RCSB server name
PORT=873 # port RCSB server is using

# Mirror the divided mmCIF tree. "2>&1 > /dev/null" keeps stderr visible on
# the terminal while discarding the verbose stdout file listing.
rsync -rlpt -v -z --delete --port=$PORT $SERVER::20220103/pub/pdb/data/structures/divided/mmCIF/ "$OUT_DIR" 2>&1 > /dev/null

# Flatten: move every file out of the two-letter subdirectories and
# decompress it. (while-read replaces for-$(find), and all expansions are
# quoted, so the loop no longer breaks on unusual filenames.)
find "$OUT_DIR" -mindepth 2 -type f | while read -r f; do
    mv "$f" "$OUT_DIR"
    BASENAME=$(basename "$f")
    gunzip "${OUT_DIR}/${BASENAME}"
done

# Remove the now-empty subdirectories and any symlinks (GNU find syntax).
find "$OUT_DIR" -mindepth 1 -type d,l -delete

# Print the alignment dirs for which no matching .cif was downloaded.
find "$RODA_ALIGNMENT_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r d; do
    BASENAME=$(basename "$d")
    PDB_ID=$(echo "$BASENAME" | cut -d '_' -f 1)
    CIF_PATH="${OUT_DIR}/${PDB_ID}.cif"
    if [[ ! -f $CIF_PATH ]]; then
        echo "$d"
    fi
done
......@@ -35,4 +35,5 @@ SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4 --check-certificate=false
gunzip "${ROOT_DIR}/${BASENAME}"
......@@ -36,6 +36,5 @@ BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
pushd "${ROOT_DIR}"
gunzip "${ROOT_DIR}/${BASENAME}"
popd
#!/usr/bin/env bash
# BUG FIX: shebang was "sh" but the script uses [[ ]], a bashism that fails
# under POSIX shells such as dash — run it with bash.
#
# Flattens a downloaded RODA database into the format expected by OpenFold
# Args:
#     roda_dir:
#         The path to the database you want to flatten. E.g. "roda/pdb"
#         or "roda/uniclust30". Note that, to save space, this script
#         will empty this directory.
#     output_dir:
#         The directory in which to construct the reformatted data
if [[ $# != 2 ]]; then
    echo "usage: ./flatten_roda.sh <roda_dir> <output_dir>"
    exit 1
fi

RODA_DIR=$1
OUTPUT_DIR=$2
DATA_DIR="${OUTPUT_DIR}/data"
ALIGNMENT_DIR="${OUTPUT_DIR}/alignments"

mkdir -p "${DATA_DIR}"
mkdir -p "${ALIGNMENT_DIR}"

# One subdirectory per chain; "pdb"/"cif" subdirs hold structure files,
# everything else is alignments. (Globs replace $(ls ...) so names with
# whitespace don't word-split.)
for chain_dir_path in "${RODA_DIR}"/*; do
    chain_dir=$(basename "${chain_dir_path}")
    CHAIN_DIR_PATH="${RODA_DIR}/${chain_dir}"
    for subdir_path in "${CHAIN_DIR_PATH}"/*; do
        subdir=$(basename "${subdir_path}")
        if [[ $subdir = "pdb" ]] || [[ $subdir = "cif" ]]; then
            mv "${CHAIN_DIR_PATH}/${subdir}"/* "${DATA_DIR}"
        else
            CHAIN_ALIGNMENT_DIR="${ALIGNMENT_DIR}/${chain_dir}"
            mkdir -p "${CHAIN_ALIGNMENT_DIR}"
            mv "${CHAIN_DIR_PATH}/${subdir}"/* "${CHAIN_ALIGNMENT_DIR}"
        fi
    done
done

# If no structure files were found, remove the empty data dir so callers
# can tell nothing was produced. (Quoted: rm -rf on an unquoted var is
# dangerous if it is ever empty or contains spaces.)
NO_DATA_FILES=$(find "${DATA_DIR}" -type f | wc -l)
if [[ $NO_DATA_FILES = 0 ]]; then
    rm -rf "${DATA_DIR}"
fi
......@@ -2,35 +2,62 @@ import argparse
import os
import pickle
from alphafold.data import pipeline, templates
from alphafold.data import pipeline, pipeline_multimer, templates
from alphafold.data.tools import hmmsearch, hhsearch
from scripts.utils import add_data_args
def main(args):
template_featurizer = templates.TemplateHitFeaturizer(
mmcif_dir=args.mmcif_dir,
max_template_date=args.max_template_date,
max_hits=20,
kalign_binary_path=args.kalign_binary_path,
release_dates_path=None,
obsolete_pdbs_path=args.obsolete_pdbs_path,
)
if (args.multimer):
template_searcher = hmmsearch.Hmmsearch(
binary_path=args.hmmsearch_binary_path,
hmmbuild_binary_path=args.hmmbuild_binary_path,
database_path=args.pdb_seqres_database_path,
)
template_featurizer = templates.HmmsearchHitFeaturizer(
mmcif_dir=args.template_mmcif_dir,
max_template_date=args.max_template_date,
max_hits=20,
kalign_binary_path=args.kalign_binary_path,
release_dates_path=args.release_dates_path,
obsolete_pdbs_path=args.obsolete_pdbs_path
)
else:
template_searcher = hhsearch.HHSearch(
binary_path=args.hhsearch_binary_path,
databases=[args.pdb70_database_path],
)
template_featurizer = templates.HhsearchHitFeaturizer(
mmcif_dir=args.template_mmcif_dir,
max_template_date=args.max_template_date,
max_hits=20,
kalign_binary_path=args.kalign_binary_path,
release_dates_path=None,
obsolete_pdbs_path=args.obsolete_pdbs_path
)
data_pipeline = pipeline.DataPipeline(
jackhmmer_binary_path=args.jackhmmer_binary_path,
hhblits_binary_path=args.hhblits_binary_path,
hhsearch_binary_path=args.hhsearch_binary_path,
uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path,
uniclust30_database_path=args.uniclust30_database_path,
pdb70_database_path=args.pdb70_database_path,
small_bfd_database_path=None,
template_featurizer=template_featurizer,
template_searcher=template_searcher,
use_small_bfd=False,
)
if (args.multimer):
data_pipeline = pipeline_multimer.DataPipeline(
monomer_data_pipeline=data_pipeline,
jackhmmer_binary_path=args.jackhmmer_binary_path,
uniprot_database_path=args.uniprot_database_path)
feature_dict = data_pipeline.process(
input_fasta_path=args.fasta_path,
msa_output_dir=args.output_dir,
......@@ -44,6 +71,7 @@ if __name__ == "__main__":
parser.add_argument("fasta_path", type=str)
parser.add_argument("mmcif_dir", type=str)
parser.add_argument("output_dir", type=str)
parser.add_argument("--multimer", action='store_true')
add_data_args(parser)
args = parser.parse_args()
......
......@@ -54,9 +54,8 @@ def parse_file(
chain_dict["seq"] = residue_constants.aatype_to_str_sequence(
protein_object.aatype,
)
local_data["resolution"] = 0.
cluster_size = chain_cluster_size_dict.get(file_id.upper(), -1)
chain_dict["resolution"] = 0.
if(chain_cluster_size_dict is not None):
cluster_size = chain_cluster_size_dict.get(
full_name.upper(), -1
......
#!/bin/bash
git clone --branch v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite
&& mkdir /tmp/hh-suite/build
&& pushd /tmp/hh-suite/build
&& cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite ..
&& make -j 4 && make install
&& ln -s /opt/hhsuite/bin/* /usr/bin
&& popd
&& rm -rf /tmp/hh-suite
git clone --branch v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite \
&& mkdir /tmp/hh-suite/build \
&& pushd /tmp/hh-suite/build \
&& cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. \
&& make -j 4 && make install \
&& ln -sf /opt/hhsuite/bin/* /usr/bin \
&& popd \
&& rm -rf /tmp/hh-suite
......@@ -15,7 +15,15 @@ wget -P /tmp \
export PATH=lib/conda/bin:$PATH
lib/conda/bin/python3 -m pip install nvidia-pyindex
conda env create --name=${ENV_NAME} -f environment.yml
source activate ${ENV_NAME}
source scripts/activate_conda_env.sh
echo "Attempting to install FlashAttention"
git clone https://github.com/HazyResearch/flash-attention
CUR_DIR=$PWD
cd flash-attention
git checkout 5b838a8bef
python3 setup.py install
cd $CUR_DIR
# Install DeepMind's OpenMM patch
OPENFOLD_DIR=$PWD
......@@ -24,15 +32,18 @@ pushd lib/conda/envs/$ENV_NAME/lib/python3.7/site-packages/ \
&& popd
# Download folding resources
wget -q -P openfold/resources \
wget --no-check-certificate -P openfold/resources \
https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt
# Certain tests need access to this file
mkdir -p tests/test_data/alphafold/common
ln -rs openfold/resources/stereo_chemical_props.txt tests/test_data/alphafold/common
# Download pretrained openfold weights
scripts/download_alphafold_params.sh openfold/resources
echo "Downloading OpenFold parameters..."
bash scripts/download_openfold_params.sh openfold/resources
echo "Downloading AlphaFold parameters..."
bash scripts/download_alphafold_params.sh openfold/resources
# Decompress test data
gunzip tests/test_data/sample_feats.pickle.gz
......@@ -227,7 +227,7 @@ if __name__ == "__main__":
)
add_data_args(parser)
parser.add_argument(
"--raise_errors", type=bool, default=False,
"--raise_errors", action="store_true", default=False,
help="Whether to crash on parsing errors"
)
parser.add_argument(
......
......@@ -23,12 +23,12 @@ DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/mmseqs_dbs"
mkdir -p $ROOT_DIR
for f in $(ls ${DOWNLOAD_DIR}/*.tar.gz)
for f in $(ls ${DOWNLOAD_DIR}/*.tar*)
do
tar --extract --verbose --file="${f}" \
--directory=$ROOT_DIR
rm "${f}"
BASENAME="$(basename {f%%.*})"
BASENAME="$(basename ${f%%.*})"
DB_NAME="${BASENAME}_db"
OLD_PWD=$(pwd)
cd $ROOT_DIR
......
#!/bin/bash
# Generates uniclust30 all-against-all alignments on a SLURM cluster.
# Thanks to Milot Mirdita for help & feedback on this script.
set -e

if [[ $# != 3 ]]; then
    echo "usage: ./run_uniclust30_search.sh <uniclust30_path> <scratch_dir> <out_dir>"
    # BUG FIX: bare "exit" returned status 0 on a usage error.
    exit 1
fi

UNICLUST_PATH=$1
SCRATCH_DIR_BN=$2
OUT_DIR=$3
CPUS_PER_TASK=4          # hhblits threads per task
MAX_SIZE=10000000000     # 10GB
# Per-node scratch directory (suffix makes it unique per SLURM node).
SCRATCH_DIR="${SCRATCH_DIR_BN}_${SLURM_NODEID}"

mkdir -p "${SCRATCH_DIR}"
mkdir -p "${OUT_DIR}"

# copy database to local ssd (tmpfs) for fast random access
DB_BN=$(basename "$UNICLUST_PATH")
DB_DIR="/dev/shm/uniclust30"
mkdir -p "$DB_DIR"
cp "${UNICLUST_PATH}"*.ff* "$DB_DIR"
DB="${DB_DIR}/${DB_BN}"

# Collect keys already present in previous output zips so they are not
# searched again on restart.
for f in "$OUT_DIR"/*.zip
do
    zipinfo -1 "$f" '*/' | awk -F/ '{print $(NF-1)}' >> "${DB_DIR}/already_searched.txt"
done
python3 filter_ffindex.py "${DB}_a3m.ffindex" "${DB_DIR}/already_searched.txt" "${DB_DIR}/filtered_a3m.ffindex"

# Shard the filtered index across SLURM nodes; this node processes slice
# (SLURM_NODEID + 1) of SLURM_JOB_NUM_NODES.
TARGET="${DB}_a3m_${SLURM_NODEID}.ffindex"
split -n "l/$((SLURM_NODEID + 1))/${SLURM_JOB_NUM_NODES}" "${DB_DIR}/filtered_a3m.ffindex" > "$TARGET"
# Initialize a counting semaphore with $1 slots, implemented as an
# anonymous FIFO on fd 3. The FIFO is unlinked immediately after being
# opened, so it disappears automatically when the script exits.
open_sem() {
    mkfifo pipe-$$
    exec 3<>pipe-$$
    rm pipe-$$
    local i=$1
    # Seed the pipe with N tokens; each token is a 3-digit exit status
    # ("000" = success) matching what run_with_lock pushes back.
    for ((;i>0;i--)); do
        printf %s 000 >&3
    done
}
# run the given command asynchronously and pop/push tokens
# Blocks until a semaphore token is available on fd 3, then runs "$@" in a
# background subshell. Exits the script if a previously completed task
# pushed a non-zero status token.
run_with_lock() {
    local x
    # this read waits until there is something to read
    read -u 3 -n 3 x && ((0==x)) || exit $x
    (
        ( "$@"; )
        # push the return code of the command to the semaphore
        printf '%.3d' $? >&3
    )&
}
# Search one a3m record against the local uniclust30 DB with hhblits.
# Args: $1 = ffindex KEY, $2 = byte OFFset, $3 = record LENgth.
# BUG FIX: the caller passes KEY/OFF/LEN as arguments, but the function
# previously ignored them and read the caller's globals; bind the
# positional parameters explicitly so the function is self-contained.
task() {
    local KEY="$1"
    local OFF="$2"
    local LEN="$3"
    # Extract exactly this record from the packed ffdata and pipe it in.
    dd if="${DB}_a3m.ffdata" ibs=1 skip="${OFF}" count="${LEN}" status=none | \
        hhblits -i stdin \
            -oa3m "${SCRATCH_DIR}/${KEY}/uniclust30.a3m" \
            -v 0 \
            -o /dev/null \
            -cpu $CPUS_PER_TASK \
            -d $DB \
            -n 3 \
            -e 0.001
}
# Archive the scratch directory into a randomly named zip in OUT_DIR and
# clear the scratch space.
# NOTE(review): the size-threshold condition is commented out and replaced
# by an always-true test, so this zips on every call — confirm whether that
# is intentional. Also, `du -hbs` combines -h (human-readable) with -b
# (bytes); the numeric -gt comparison against MAX_SIZE would not work on
# -h style output. Verify before re-enabling the threshold.
zip_or_not() {
    SIZE=$(du -hbs $SCRATCH_DIR | sed 's/|/ /' | awk '{print $1}')
    #if [[ "$SIZE" -gt "$MAX_SIZE" ]]
    if [[ "2" -gt "1" ]]
    then
        # Wait for in-flight tasks so the zip only captures complete files.
        wait
        RANDOM_NAME=$(cat /dev/urandom | tr -cd 'a-f0-9' | head -c 32)
        zip -r "${OUT_DIR}/${RANDOM_NAME}.zip" $SCRATCH_DIR
        find $SCRATCH_DIR -mindepth 1 -type d -exec rm -rf {} +
    fi
}
# Run one concurrent task per CPUS_PER_TASK-sized slice of the machine.
N=$(($(nproc) / ${CPUS_PER_TASK}))
open_sem $N
# Each ffindex line is "KEY OFFSET LENGTH".
while read -r KEY OFF LEN; do
    PROT_DIR="${SCRATCH_DIR}/${KEY}"
    # Skip entries already processed into this scratch space.
    if [[ -d $PROT_DIR ]]
    then
        continue
    fi
    mkdir -p $PROT_DIR
    run_with_lock task "${KEY}" "${OFF}" "${LEN}"
    zip_or_not
done < $TARGET
wait
# Archive whatever remains after the final batch.
zip_or_not
wait
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment