Commit 13f8f163 authored by zhuwenwen (parents: a509a4c5 b5fa2ba3)
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the PDB70 database for AlphaFold.
#
# Usage: bash download_pdb70.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/pdb70"
SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/old-releases/pdb70_from_mmcif_200401.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" --check-certificate=false
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
--directory="${ROOT_DIR}"
rm "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads, unzips and flattens the PDB database for AlphaFold.
#
# Usage: bash download_pdb_mmcif.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
if ! command -v rsync &> /dev/null ; then
echo "Error: rsync could not be found. Please install rsync."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/pdb_mmcif"
RAW_DIR="${ROOT_DIR}/raw"
MMCIF_DIR="${ROOT_DIR}/mmcif_files"
echo "Running rsync to fetch all mmCIF files (note that the rsync progress estimate might be inaccurate)..."
mkdir --parents "${RAW_DIR}"
rsync --recursive --links --perms --times --compress --info=progress2 --delete --port=33444 \
rsync.rcsb.org::ftp_data/structures/divided/mmCIF/ \
"${RAW_DIR}"
echo "Unzipping all mmCIF files..."
find "${RAW_DIR}/" -type f -iname "*.gz" -exec gunzip {} +
echo "Flattening all mmCIF files..."
mkdir --parents "${MMCIF_DIR}"
find "${RAW_DIR}" -type d -empty -delete # Delete empty directories.
for subdir in "${RAW_DIR}"/*; do
mv "${subdir}/"*.cif "${MMCIF_DIR}"
done
# Delete empty download directory structure.
find "${RAW_DIR}" -type d -empty -delete
aria2c "ftp://ftp.wwpdb.org/pub/pdb/data/status/obsolete.dat" --dir="${ROOT_DIR}"
#!/bin/bash
#
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads .cif files matching the RODA alignments. Outputs a list of
# RODA alignments for which .cif files could not be found.
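#
# Usage: bash download_roda_pdbs.sh <out_dir> <roda_pdb_alignment_dir>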
if [[ $# != 2 ]]; then
echo "usage: ./download_roda_pdbs.sh <out_dir> <roda_pdb_alignment_dir>"
exit 1
fi
OUT_DIR=$1
RODA_ALIGNMENT_DIR=$2
if [[ -d $OUT_DIR ]]; then
echo "${OUT_DIR} already exists. Download failed..."
exit 1
fi
SERVER=snapshotrsync.rcsb.org # RCSB server name
PORT=873 # port RCSB server is using
rsync -rlpt -v -z --delete --port=$PORT $SERVER::20220103/pub/pdb/data/structures/divided/mmCIF/ "${OUT_DIR}" > /dev/null 2>&1
for f in $(find "${OUT_DIR}" -mindepth 2 -type f); do
    mv "${f}" "${OUT_DIR}"
    BASENAME=$(basename "${f}")
    gunzip "${OUT_DIR}/${BASENAME}"
done
find "${OUT_DIR}" -mindepth 1 -type d,l -delete
for d in $(find $RODA_ALIGNMENT_DIR -mindepth 1 -maxdepth 1 -type d); do
BASENAME=$(basename $d)
PDB_ID=$(echo $BASENAME | cut -d '_' -f 1)
CIF_PATH="${OUT_DIR}/${PDB_ID}.cif"
if [[ ! -f $CIF_PATH ]]; then
echo $d
fi
done
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the Small BFD database for AlphaFold.
#
# Usage: bash download_small_bfd.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/small_bfd"
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/reduced_dbs/bfd-first_non_consensus_sequences.fasta.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
pushd "${ROOT_DIR}"
gunzip "${ROOT_DIR}/${BASENAME}"
popd
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the Uniclust30 database for AlphaFold.
#
# Usage: bash download_uniclust30.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/uniclust30"
# Mirror of:
# http://wwwuser.gwdg.de/~compbiol/uniclust/2018_08/uniclust30_2018_08_hhsuite.tar.gz
SOURCE_URL="https://storage.googleapis.com/alphafold-databases/casp14_versions/uniclust30_2018_08_hhsuite.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
--directory="${ROOT_DIR}"
rm "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads the ColabFold UniRef30 (2021_03) database for AlphaFold.
# Note that, despite the usage line below, the source URL points at
# UniRef30 rather than the original BFD.
#
# Usage: bash download_bfd.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}"
SOURCE_URL="http://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4 --check-certificate=false
gunzip "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Downloads and unzips the UniRef90 database for AlphaFold.
#
# Usage: bash download_uniref90.sh /path/to/download/directory
set -e
if [[ $# -eq 0 ]]; then
echo "Error: download directory must be provided as an input argument."
exit 1
fi
if ! command -v aria2c &> /dev/null ; then
echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
exit 1
fi
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/uniref90"
SOURCE_URL="ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz"
BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
gunzip "${ROOT_DIR}/${BASENAME}"
#!/bin/bash
#
# Flattens a downloaded RODA database into the format expected by OpenFold
# Args:
# roda_dir:
# The path to the database you want to flatten. E.g. "roda/pdb"
# or "roda/uniclust30". Note that, to save space, this script
# will empty this directory.
# output_dir:
# The directory in which to construct the reformatted data
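#
# For example (hypothetical file names), an input tree like
#     roda/pdb/1abc_A/cif/1abc.cif
#     roda/pdb/1abc_A/a3m/uniref90_hits.a3m
# is rearranged into
#     <output_dir>/data/1abc.cif
#     <output_dir>/alignments/1abc_A/uniref90_hits.a3m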
if [[ $# != 2 ]]; then
echo "usage: ./flatten_roda.sh <roda_dir> <output_dir>"
exit 1
fi
RODA_DIR=$1
OUTPUT_DIR=$2
DATA_DIR="${OUTPUT_DIR}/data"
ALIGNMENT_DIR="${OUTPUT_DIR}/alignments"
mkdir -p "${DATA_DIR}"
mkdir -p "${ALIGNMENT_DIR}"
for CHAIN_DIR_PATH in "${RODA_DIR}"/*; do
    chain_dir=$(basename "${CHAIN_DIR_PATH}")
    for subdir_path in "${CHAIN_DIR_PATH}"/*; do
        subdir=$(basename "${subdir_path}")
if [[ $subdir = "pdb" ]] || [[ $subdir = "cif" ]]; then
mv "${CHAIN_DIR_PATH}/${subdir}"/* "${DATA_DIR}"
else
CHAIN_ALIGNMENT_DIR="${ALIGNMENT_DIR}/${chain_dir}"
mkdir -p "${CHAIN_ALIGNMENT_DIR}"
mv "${CHAIN_DIR_PATH}/${subdir}"/* "${CHAIN_ALIGNMENT_DIR}"
fi
done
done
NO_DATA_FILES=$(find "${DATA_DIR}" -type f | wc -l)
if [[ $NO_DATA_FILES -eq 0 ]]; then
    rm -rf "${DATA_DIR}"
fi
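"""Runs the AlphaFold data pipeline on an input FASTA file and pickles
the resulting feature dict to <output_dir>/feature_dict.pickle."""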
import argparse
import os
import pickle
from alphafold.data import pipeline, templates
from scripts.utils import add_data_args
def main(args):
template_featurizer = templates.TemplateHitFeaturizer(
mmcif_dir=args.mmcif_dir,
max_template_date=args.max_template_date,
max_hits=20,
kalign_binary_path=args.kalign_binary_path,
release_dates_path=None,
obsolete_pdbs_path=args.obsolete_pdbs_path,
)
data_pipeline = pipeline.DataPipeline(
jackhmmer_binary_path=args.jackhmmer_binary_path,
hhblits_binary_path=args.hhblits_binary_path,
hhsearch_binary_path=args.hhsearch_binary_path,
uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path,
uniclust30_database_path=args.uniclust30_database_path,
pdb70_database_path=args.pdb70_database_path,
small_bfd_database_path=None,
template_featurizer=template_featurizer,
use_small_bfd=False,
)
feature_dict = data_pipeline.process(
input_fasta_path=args.fasta_path,
msa_output_dir=args.output_dir,
)
with open(os.path.join(args.output_dir, "feature_dict.pickle"), "wb") as fp:
pickle.dump(feature_dict, fp, protocol=pickle.HIGHEST_PROTOCOL)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("fasta_path", type=str)
parser.add_argument("mmcif_dir", type=str)
parser.add_argument("output_dir", type=str)
add_data_args(parser)
args = parser.parse_args()
main(args)
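"""Builds a JSON cache of chain-level metadata (sequence, release date,
resolution and, optionally, cluster size) from a directory of mmCIF
and/or PDB files, parsing files in parallel."""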
import argparse
from functools import partial
import json
import logging
from multiprocessing import Pool
import os
import sys
sys.path.append(".") # an innocent hack to get this to run from the top level
from tqdm import tqdm
from openfold.data.mmcif_parsing import parse
from openfold.np import protein, residue_constants
def parse_file(
f,
args,
chain_cluster_size_dict
):
file_id, ext = os.path.splitext(f)
if(ext == ".cif"):
with open(os.path.join(args.data_dir, f), "r") as fp:
mmcif_string = fp.read()
mmcif = parse(file_id=file_id, mmcif_string=mmcif_string)
if mmcif.mmcif_object is None:
logging.info(f"Could not parse {f}. Skipping...")
return {}
else:
mmcif = mmcif.mmcif_object
out = {}
for chain_id, seq in mmcif.chain_to_seqres.items():
full_name = "_".join([file_id, chain_id])
out[full_name] = {}
local_data = out[full_name]
local_data["release_date"] = mmcif.header["release_date"]
local_data["seq"] = seq
local_data["resolution"] = mmcif.header["resolution"]
if(chain_cluster_size_dict is not None):
cluster_size = chain_cluster_size_dict.get(
full_name.upper(), -1
)
local_data["cluster_size"] = cluster_size
elif(ext == ".pdb"):
with open(os.path.join(args.data_dir, f), "r") as fp:
pdb_string = fp.read()
protein_object = protein.from_pdb_string(pdb_string, None)
chain_dict = {}
chain_dict["seq"] = residue_constants.aatype_to_str_sequence(
protein_object.aatype,
)
chain_dict["resolution"] = 0.
if(chain_cluster_size_dict is not None):
cluster_size = chain_cluster_size_dict.get(
                file_id.upper(), -1
)
chain_dict["cluster_size"] = cluster_size
out = {file_id: chain_dict}
return out
def main(args):
chain_cluster_size_dict = None
if(args.cluster_file is not None):
chain_cluster_size_dict = {}
with open(args.cluster_file, "r") as fp:
clusters = [l.strip() for l in fp.readlines()]
for cluster in clusters:
chain_ids = cluster.split()
cluster_len = len(chain_ids)
for chain_id in chain_ids:
chain_id = chain_id.upper()
chain_cluster_size_dict[chain_id] = cluster_len
accepted_exts = [".cif", ".pdb"]
files = list(os.listdir(args.data_dir))
files = [f for f in files if os.path.splitext(f)[-1] in accepted_exts]
fn = partial(
parse_file,
args=args,
chain_cluster_size_dict=chain_cluster_size_dict,
)
data = {}
with Pool(processes=args.no_workers) as p:
with tqdm(total=len(files)) as pbar:
for d in p.imap_unordered(fn, files, chunksize=args.chunksize):
data.update(d)
pbar.update()
with open(args.output_path, "w") as fp:
fp.write(json.dumps(data, indent=4))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"data_dir", type=str, help="Directory containing mmCIF or PDB files"
)
parser.add_argument(
"output_path", type=str, help="Path for .json output"
)
parser.add_argument(
"--cluster_file", type=str, default=None,
help=(
"Path to a cluster file (e.g. PDB40), one cluster "
"({PROT1_ID}_{CHAIN_ID} {PROT2_ID}_{CHAIN_ID} ...) per line. "
"Chains not in this cluster file will NOT be filtered by cluster "
"size."
)
)
parser.add_argument(
"--no_workers", type=int, default=4,
help="Number of workers to use for parsing"
)
parser.add_argument(
"--chunksize", type=int, default=10,
help="How many files should be distributed to each worker at a time"
)
args = parser.parse_args()
main(args)
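"""Builds a JSON cache of mmCIF metadata (chain IDs, sequences, chain
count, release date and resolution), keyed by file ID and parsed in
parallel."""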
import argparse
from functools import partial
import json
import logging
from multiprocessing import Pool
import os
import sys
sys.path.append(".") # an innocent hack to get this to run from the top level
from tqdm import tqdm
from openfold.data.mmcif_parsing import parse
def parse_file(f, args):
with open(os.path.join(args.mmcif_dir, f), "r") as fp:
mmcif_string = fp.read()
file_id = os.path.splitext(f)[0]
mmcif = parse(file_id=file_id, mmcif_string=mmcif_string)
if mmcif.mmcif_object is None:
logging.info(f"Could not parse {f}. Skipping...")
return {}
else:
mmcif = mmcif.mmcif_object
local_data = {}
local_data["release_date"] = mmcif.header["release_date"]
chain_ids, seqs = list(zip(*mmcif.chain_to_seqres.items()))
local_data["chain_ids"] = chain_ids
local_data["seqs"] = seqs
local_data["no_chains"] = len(chain_ids)
local_data["resolution"] = mmcif.header["resolution"]
return {file_id: local_data}
def main(args):
    files = [f for f in os.listdir(args.mmcif_dir) if f.endswith(".cif")]
fn = partial(parse_file, args=args)
data = {}
with Pool(processes=args.no_workers) as p:
with tqdm(total=len(files)) as pbar:
for d in p.imap_unordered(fn, files, chunksize=args.chunksize):
data.update(d)
pbar.update()
with open(args.output_path, "w") as fp:
fp.write(json.dumps(data, indent=4))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"mmcif_dir", type=str, help="Directory containing mmCIF files"
)
parser.add_argument(
"output_path", type=str, help="Path for .json output"
)
parser.add_argument(
"--no_workers", type=int, default=4,
help="Number of workers to use for parsing"
)
parser.add_argument(
"--chunksize", type=int, default=10,
help="How many files should be distributed to each worker at a time"
)
args = parser.parse_args()
main(args)
#!/bin/bash
git clone --branch v3.3.0 https://github.com/soedinglab/hh-suite.git /tmp/hh-suite \
&& mkdir /tmp/hh-suite/build \
&& pushd /tmp/hh-suite/build \
&& cmake -DCMAKE_INSTALL_PREFIX=/opt/hhsuite .. \
&& make -j 4 && make install \
&& ln -sf /opt/hhsuite/bin/* /usr/bin \
&& popd \
&& rm -rf /tmp/hh-suite
#!/bin/bash
CONDA_INSTALL_URL=${CONDA_INSTALL_URL:-"https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh"}
source scripts/vars.sh
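# scripts/vars.sh is expected to define ENV_NAME, used below to name the
# conda environment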
# Install Miniconda locally
rm -rf lib/conda
CONDA_INSTALLER="/tmp/$(basename "${CONDA_INSTALL_URL}")"
rm -f "${CONDA_INSTALLER}"
wget -P /tmp \
    "${CONDA_INSTALL_URL}" \
    && bash "${CONDA_INSTALLER}" -b -p lib/conda \
    && rm "${CONDA_INSTALLER}"
# Grab conda-only packages
export PATH=lib/conda/bin:$PATH
lib/conda/bin/python3 -m pip install nvidia-pyindex
conda env create --name=${ENV_NAME} -f environment.yml
source scripts/activate_conda_env.sh
echo "Attempting to install FlashAttention"
git clone https://github.com/HazyResearch/flash-attention
CUR_DIR=$PWD
cd flash-attention
git checkout 5b838a8bef
python3 setup.py install
cd $CUR_DIR
# Install DeepMind's OpenMM patch
OPENFOLD_DIR=$PWD
pushd lib/conda/envs/$ENV_NAME/lib/python3.7/site-packages/ \
&& patch -p0 < $OPENFOLD_DIR/lib/openmm.patch \
&& popd
# Download folding resources
wget --no-check-certificate -P openfold/resources \
https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt
# Certain tests need access to this file
mkdir -p tests/test_data/alphafold/common
ln -rs openfold/resources/stereo_chemical_props.txt tests/test_data/alphafold/common
echo "Downloading OpenFold parameters..."
bash scripts/download_openfold_params.sh openfold/resources
echo "Downloading AlphaFold parameters..."
bash scripts/download_alphafold_params.sh openfold/resources
# Decompress test data
gunzip tests/test_data/sample_feats.pickle.gz
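"""Precomputes alignments for mmCIF, FASTA and ProteinNet .core files.
Unique sequences are aligned once and the results are copied to every
chain that shares the sequence. Work can be sharded across SLURM nodes
and split among local threads via --no_tasks."""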
import argparse
from functools import partial
import json
import logging
import os
import threading
from multiprocessing import cpu_count
from shutil import copyfile
import tempfile
import openfold.data.mmcif_parsing as mmcif_parsing
from openfold.data.data_pipeline import AlignmentRunner
from openfold.data.parsers import parse_fasta
from openfold.np import protein, residue_constants
from utils import add_data_args
logging.basicConfig(level=logging.WARNING)
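# Runs the alignment tools once per unique sequence, then copies the
# resulting alignments to the directories of all chains sharing that
# sequence.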
def run_seq_group_alignments(seq_groups, alignment_runner, args):
dirs = set(os.listdir(args.output_dir))
for seq, names in seq_groups:
first_name = names[0]
alignment_dir = os.path.join(args.output_dir, first_name)
try:
os.makedirs(alignment_dir)
except Exception as e:
logging.warning(f"Failed to create directory for {first_name} with exception {e}...")
continue
fd, fasta_path = tempfile.mkstemp(suffix=".fasta")
with os.fdopen(fd, 'w') as fp:
fp.write(f'>query\n{seq}')
try:
alignment_runner.run(
fasta_path, alignment_dir
)
        except Exception:
            logging.warning(f"Failed to run alignments for {first_name}. Skipping...")
os.remove(fasta_path)
os.rmdir(alignment_dir)
continue
os.remove(fasta_path)
for name in names[1:]:
if(name in dirs):
logging.warning(
f'{name} has already been processed. Skipping...'
)
continue
cp_dir = os.path.join(args.output_dir, name)
os.makedirs(cp_dir, exist_ok=True)
for f in os.listdir(alignment_dir):
copyfile(os.path.join(alignment_dir, f), os.path.join(cp_dir, f))
def parse_and_align(files, alignment_runner, args):
for f in files:
path = os.path.join(args.input_dir, f)
file_id = os.path.splitext(f)[0]
seq_group_dict = {}
if(f.endswith('.cif')):
with open(path, 'r') as fp:
mmcif_str = fp.read()
mmcif = mmcif_parsing.parse(
file_id=file_id, mmcif_string=mmcif_str
)
if(mmcif.mmcif_object is None):
logging.warning(f'Failed to parse {f}...')
if(args.raise_errors):
raise list(mmcif.errors.values())[0]
else:
continue
mmcif = mmcif.mmcif_object
for chain_letter, seq in mmcif.chain_to_seqres.items():
chain_id = '_'.join([file_id, chain_letter])
l = seq_group_dict.setdefault(seq, [])
l.append(chain_id)
elif(f.endswith('.fasta') or f.endswith('.fa')):
with open(path, 'r') as fp:
fasta_str = fp.read()
input_seqs, _ = parse_fasta(fasta_str)
if len(input_seqs) != 1:
msg = f'More than one input_sequence found in {f}'
if(args.raise_errors):
raise ValueError(msg)
else:
logging.warning(msg)
input_sequence = input_seqs[0]
seq_group_dict[input_sequence] = [file_id]
elif(f.endswith('.core')):
with open(path, 'r') as fp:
core_str = fp.read()
core_prot = protein.from_proteinnet_string(core_str)
aatype = core_prot.aatype
seq = ''.join([
residue_constants.restypes_with_x[aatype[i]]
for i in range(len(aatype))
])
seq_group_dict[seq] = [file_id]
else:
continue
seq_group_tuples = [(k,v) for k,v in seq_group_dict.items()]
run_seq_group_alignments(seq_group_tuples, alignment_runner, args)
def main(args):
# Build the alignment tool runner
alignment_runner = AlignmentRunner(
jackhmmer_binary_path=args.jackhmmer_binary_path,
hhblits_binary_path=args.hhblits_binary_path,
hhsearch_binary_path=args.hhsearch_binary_path,
uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path,
uniclust30_database_path=args.uniclust30_database_path,
pdb70_database_path=args.pdb70_database_path,
use_small_bfd=args.bfd_database_path is None,
no_cpus=args.cpus_per_task,
)
files = list(os.listdir(args.input_dir))
# Do some filtering
if(args.mmcif_cache is not None):
with open(args.mmcif_cache, "r") as fp:
cache = json.load(fp)
else:
cache = None
dirs = []
if(cache is not None and args.filter):
dirs = set(os.listdir(args.output_dir))
def prot_is_done(f):
prot_id = os.path.splitext(f)[0]
if(prot_id in cache):
chain_ids = cache[prot_id]["chain_ids"]
for c in chain_ids:
full_name = prot_id + "_" + c
if(not full_name in dirs):
return False
else:
return False
return True
files = [f for f in files if not prot_is_done(f)]
def split_up_arglist(arglist):
# Split up the survivors
if(os.environ.get("SLURM_JOB_NUM_NODES", 0)):
num_nodes = int(os.environ["SLURM_JOB_NUM_NODES"])
if(num_nodes > 1):
node_id = int(os.environ["SLURM_NODEID"])
logging.warning(f"Num nodes: {num_nodes}")
logging.warning(f"Node ID: {node_id}")
arglist = arglist[node_id::num_nodes]
t_arglist = []
for i in range(args.no_tasks):
t_arglist.append(arglist[i::args.no_tasks])
return t_arglist
if(cache is not None and "seqs" in next(iter(cache.values()))):
seq_group_dict = {}
for f in files:
prot_id = os.path.splitext(f)[0]
if(prot_id in cache):
prot_cache = cache[prot_id]
chains_seqs = zip(
prot_cache["chain_ids"], prot_cache["seqs"]
)
for chain, seq in chains_seqs:
chain_name = prot_id + "_" + chain
if(chain_name not in dirs):
l = seq_group_dict.setdefault(seq, [])
l.append(chain_name)
func = partial(run_seq_group_alignments,
alignment_runner=alignment_runner,
args=args
)
seq_groups = [(k,v) for k,v in seq_group_dict.items()]
# Sort them by group length so the tasks are approximately balanced
seq_groups = sorted(seq_groups, key=lambda x: len(x[1]))
task_arglist = [[a] for a in split_up_arglist(seq_groups)]
else:
func = partial(parse_and_align,
alignment_runner=alignment_runner,
args=args,
)
task_arglist = [[a] for a in split_up_arglist(files)]
threads = []
for i, task_args in enumerate(task_arglist):
print(f"Started thread {i}...")
t = threading.Thread(target=func, args=task_args)
threads.append(t)
t.start()
for t in threads:
t.join()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"input_dir", type=str,
help="""Path to directory containing mmCIF, FASTA and/or ProteinNet
.core files"""
)
parser.add_argument(
"output_dir", type=str,
help="Directory in which to output alignments"
)
add_data_args(parser)
parser.add_argument(
"--raise_errors", action="store_true", default=False,
help="Whether to crash on parsing errors"
)
parser.add_argument(
"--cpus_per_task", type=int, default=cpu_count(),
help="Number of CPUs to use"
)
parser.add_argument(
"--mmcif_cache", type=str, default=None,
help="Path to mmCIF cache. Used to filter files to be parsed"
)
    parser.add_argument(
        "--no_tasks", type=int, default=1,
        help="Number of worker threads to spawn per node"
    )
    parser.add_argument(
        "--filter", type=lambda s: s.lower() not in ("false", "0", "no"),
        default=True,
        help="""Whether to skip chains that already have alignment
            directories (requires --mmcif_cache)"""
    )
args = parser.parse_args()
main(args)
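"""Generates MSAs with MMseqs2 via scripts/colabfold_search.sh,
processing the input FASTA in chunks, splitting the resulting
multi-entry .a3m databases into per-sequence directories and, if an
hhsearch binary and PDB70 are provided, computing template hits.
Assumes a two-line-per-record input FASTA."""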
import argparse
import logging
import os
from pathlib import Path
import subprocess
from openfold.data.tools import hhsearch
def _split_a3ms(output_dir):
for fname in os.listdir(output_dir):
if(not os.path.splitext(fname)[-1] == ".a3m"):
continue
fpath = os.path.join(output_dir, fname)
with open(fpath, "r") as fp:
a3ms = fp.read()
# Split by the null byte, excluding the terminating null byte
a3ms = a3ms.split('\x00')[:-1]
for a3m in a3ms:
name = a3m.split('\n', 1)[0][1:]
prot_dir = os.path.join(output_dir, name)
Path(prot_dir).mkdir(parents=True, exist_ok=True)
with open(os.path.join(prot_dir, fname), "w") as fp:
fp.write(a3m)
os.remove(fpath)
os.remove(fpath + ".dbtype")
os.remove(fpath + ".index")
def main(args):
with open(args.input_fasta, "r") as f:
lines = [l.strip() for l in f.readlines()]
names = lines[::2]
seqs = lines[1::2]
if(args.fasta_chunk_size is None):
chunk_size = len(seqs)
else:
chunk_size = args.fasta_chunk_size
# Make the output directory
Path(args.output_dir).mkdir(parents=True, exist_ok=True)
s = 0
while(s < len(seqs)):
e = s + chunk_size
chunk_fasta = [el for tup in zip(names[s:e], seqs[s:e]) for el in tup]
s = e
prot_dir = os.path.join(args.output_dir, chunk_fasta[0][1:].upper())
if(os.path.exists(prot_dir)):
# We've already computed this chunk
continue
chunk_fasta_path = os.path.join(args.output_dir, "tmp.fasta")
with open(chunk_fasta_path, "w") as f:
f.write('\n'.join(chunk_fasta) + '\n')
cmd = [
"scripts/colabfold_search.sh",
args.mmseqs_binary_path,
chunk_fasta_path,
args.mmseqs_db_dir,
args.output_dir,
args.uniref_db,
'""',
'""' if args.env_db is None else args.env_db,
"0" if args.env_db is None else "1",
"0", # compute templates
"1", # filter
"1", # use precomputed index
"0", # db-load-mode
]
logging.info('Launching subprocess "%s"', " ".join(cmd))
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
stdout, stderr = process.communicate()
retcode = process.wait()
if retcode:
raise RuntimeError(
"MMseqs failed\nstdout:\n%s\n\nstderr:\n%s\n"
% (stdout.decode("utf-8"), stderr.decode("utf-8"))
)
_split_a3ms(args.output_dir)
# Clean up temporary files
os.remove(chunk_fasta_path)
hhsearch_pdb70_runner = hhsearch.HHSearch(
binary_path=args.hhsearch_binary_path, databases=[args.pdb70]
)
for d in os.listdir(args.output_dir):
dpath = os.path.join(args.output_dir, d)
if(not os.path.isdir(dpath)):
continue
for fname in os.listdir(dpath):
fpath = os.path.join(dpath, fname)
if(not "uniref" in fname or
not os.path.splitext(fname)[-1] == ".a3m"):
continue
with open(fpath, "r") as fp:
a3m = fp.read()
hhsearch_result = hhsearch_pdb70_runner.query(a3m)
pdb70_out_path = os.path.join(dpath, "pdb70_hits.hhr")
with open(pdb70_out_path, "w") as f:
f.write(hhsearch_result)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"input_fasta", type=str,
help="Path to input FASTA file. Can contain one or more sequences."
)
parser.add_argument(
"mmseqs_db_dir", type=str,
help="""Path to directory containing pre-processed MMSeqs2 DBs
(see README)"""
)
parser.add_argument(
"uniref_db", type=str,
help="Basename of uniref database"
)
parser.add_argument(
"output_dir", type=str,
help="Output directory"
)
parser.add_argument(
"mmseqs_binary_path", type=str,
help="Path to mmseqs binary"
)
parser.add_argument(
"--hhsearch_binary_path", type=str, default=None,
help="""Path to hhsearch binary (for template search). In future
versions, we'll also use mmseqs for this"""
)
parser.add_argument(
"--pdb70", type=str, default=None,
help="Basename of the pdb70 database"
)
parser.add_argument(
"--env_db", type=str, default=None,
help="Basename of environmental database"
)
parser.add_argument(
"--fasta_chunk_size", type=int, default=None,
help="""How many sequences should be processed at once. All sequences
processed at once by default."""
)
args = parser.parse_args()
if(args.hhsearch_binary_path is not None and args.pdb70 is None):
raise ValueError(
"pdb70 must be specified along with hhsearch_binary_path"
)
main(args)
#!/bin/bash
#
# Copyright 2021 AlQuraishi Laboratory
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Prepares MMseqs2 expandable profile databases (and their indices) from
# database tarballs already present in the download directory.
#
# Usage: bash prep_mmseqs_dbs.sh /path/to/download/directory
set -e
DOWNLOAD_DIR="$1"
ROOT_DIR="${DOWNLOAD_DIR}/mmseqs_dbs"
mkdir -p "${ROOT_DIR}"
for f in "${DOWNLOAD_DIR}"/*.tar*
do
    tar --extract --verbose --file="${f}" \
        --directory="${ROOT_DIR}"
    rm "${f}"
    BASENAME="$(basename "${f%%.*}")"
    DB_NAME="${BASENAME}_db"
    OLD_PWD=$(pwd)
    cd "${ROOT_DIR}"
mmseqs tsv2exprofiledb "${BASENAME}" "${DB_NAME}"
mmseqs createindex "${DB_NAME}" "${DOWNLOAD_DIR}/tmp/"
cd "${OLD_PWD}"
done
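"""Sorts raw ProteinNet MSA files into per-chain directories named
{PDB_ID}_{CHAIN_ID}, keeping only MSAs with a matching file in the
mmCIF directory."""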
import argparse
import logging
import os
import shutil
def main(args):
count = 0
max_count = args.max_count if args.max_count is not None else -1
msas = sorted(f for f in os.listdir(args.msa_dir))
mmcifs = sorted(f for f in os.listdir(args.mmcif_dir))
mmcif_idx = 0
for f in msas:
if(count == max_count):
break
path = os.path.join(args.msa_dir, f)
name = os.path.splitext(f)[0]
spl = name.upper().split('_')
if(len(spl) != 3):
continue
pdb_id, _, chain_id = spl
        while (mmcif_idx < len(mmcifs) - 1
               and pdb_id > os.path.splitext(mmcifs[mmcif_idx])[0].upper()):
            mmcif_idx += 1
# Only consider files with matching mmCIF files
if(pdb_id == os.path.splitext(mmcifs[mmcif_idx])[0].upper()):
dirname = os.path.join(args.out_dir, '_'.join([pdb_id, chain_id]))
os.makedirs(dirname, exist_ok=True)
dest = os.path.join(dirname, f)
if(args.copy):
shutil.copyfile(path, dest)
else:
os.rename(path, dest)
count += 1
if __name__ == "__main__":
parser = argparse.ArgumentParser(description=
"Converts raw ProteinNet MSAs into a format recognized by the parser"
)
parser.add_argument(
"msa_dir", type=str, help="Directory containing ProteinNet MSAs"
)
parser.add_argument(
"mmcif_dir", type=str, help="Directory containing PDB mmCIFs"
)
parser.add_argument(
"out_dir", type=str,
help="Directory to which output should be saved"
)
    parser.add_argument(
        "--copy", type=lambda s: s.lower() not in ("false", "0", "no"),
        default=True,
        help="Whether to copy the MSAs to out_dir rather than moving them"
    )
parser.add_argument(
"--max_count", type=int, default=None,
help="A bound on the number of MSAs to process"
)
args = parser.parse_args()
main(args)
#!/bin/bash
export CUDA_VISIBLE_DEVICES="0"
python3 -m unittest "$@" || \
echo -e "\nTest(s) failed. Make sure you've installed all Python dependencies."
#!/bin/bash
# Generates uniclust30 all-against-all alignments on a SLURM cluster.
# Thanks to Milot Mirdita for help & feedback on this script.
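# Requires hhblits, zip/zipinfo and python3 (for filter_ffindex.py) on
# PATH, and SLURM_NODEID / SLURM_JOB_NUM_NODES in the environment.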
set -e
if [[ $# != 3 ]]; then
echo "usage: ./run_uniclust30_search.sh <uniclust30_path> <scratch_dir> <out_dir>"
    exit 1
fi
UNICLUST_PATH=$1
SCRATCH_DIR_BN=$2
OUT_DIR=$3
CPUS_PER_TASK=4
MAX_SIZE=10000000000 # 10GB
SCRATCH_DIR="${SCRATCH_DIR_BN}_${SLURM_NODEID}"
mkdir -p ${SCRATCH_DIR}
mkdir -p ${OUT_DIR}
# copy database to local ssd
DB_BN=$(basename $UNICLUST_PATH)
DB_DIR="/dev/shm/uniclust30"
mkdir -p $DB_DIR
cp ${UNICLUST_PATH}*.ff* $DB_DIR
DB="${DB_DIR}/${DB_BN}"
for f in $(ls $OUT_DIR/*.zip)
do
zipinfo -1 $f '*/' | awk -F/ '{print $(NF-1)}' >> ${DB_DIR}/already_searched.txt
done
python3 filter_ffindex.py ${DB}_a3m.ffindex ${DB_DIR}/already_searched.txt ${DB_DIR}/filtered_a3m.ffindex
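# Shard the filtered index across nodes: this node takes the
# (SLURM_NODEID + 1)-th of SLURM_JOB_NUM_NODES line-based chunks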
TARGET="${DB}_a3m_${SLURM_NODEID}.ffindex"
split -n "l/$((SLURM_NODEID + 1))/${SLURM_JOB_NUM_NODES}" "${DB_DIR}/filtered_a3m.ffindex" > $TARGET
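# open_sem N: create a counting semaphore from a FIFO preloaded with N
# three-digit tokens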
open_sem() {
mkfifo pipe-$$
exec 3<>pipe-$$
rm pipe-$$
local i=$1
for ((;i>0;i--)); do
printf %s 000 >&3
done
}
# run the given command asynchronously and pop/push tokens
run_with_lock() {
local x
# this read waits until there is something to read
read -u 3 -n 3 x && ((0==x)) || exit $x
(
( "$@"; )
# push the return code of the command to the semaphore
printf '%.3d' $? >&3
)&
}
task() {
    local KEY=$1 OFF=$2 LEN=$3
    # Stream a single a3m record out of the ffdata file by byte offset
    # and length, and align it against the database with hhblits
    dd if="${DB}_a3m.ffdata" ibs=1 skip="${OFF}" count="${LEN}" status=none | \
        hhblits -i stdin \
            -oa3m "${SCRATCH_DIR}/${KEY}/uniclust30.a3m" \
            -v 0 \
            -o /dev/null \
            -cpu $CPUS_PER_TASK \
            -d $DB \
            -n 3 \
            -e 0.001
}
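# Archive finished alignments into a randomly named zip and clear the
# scratch space. The size check against MAX_SIZE is currently disabled,
# so this runs on every call.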
zip_or_not() {
SIZE=$(du -hbs $SCRATCH_DIR | sed 's/|/ /' | awk '{print $1}')
#if [[ "$SIZE" -gt "$MAX_SIZE" ]]
if [[ "2" -gt "1" ]]
then
wait
RANDOM_NAME=$(cat /dev/urandom | tr -cd 'a-f0-9' | head -c 32)
zip -r "${OUT_DIR}/${RANDOM_NAME}.zip" $SCRATCH_DIR
find $SCRATCH_DIR -mindepth 1 -type d -exec rm -rf {} +
fi
}
N=$(($(nproc) / ${CPUS_PER_TASK}))
open_sem $N
while read -r KEY OFF LEN; do
PROT_DIR="${SCRATCH_DIR}/${KEY}"
if [[ -d $PROT_DIR ]]
then
continue
fi
mkdir -p $PROT_DIR
run_with_lock task "${KEY}" "${OFF}" "${LEN}"
zip_or_not
done < $TARGET
wait
zip_or_not
wait
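"""Splits a monolithic ProteinNet text file into one .core file per
entry, treating "[ID]" lines as record delimiters."""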
import argparse
import os
from pathlib import Path
def _write_file(args, file_in_progress):
file_id = file_in_progress[1]
fname = file_id.upper() + ".core"
fpath = os.path.join(args.output_dir, fname)
with open(fpath, "w") as fp:
fp.write('\n'.join(file_in_progress))
def main(args):
Path(args.output_dir).mkdir(parents=True, exist_ok=True)
with open(args.proteinnet_file, "r") as fp:
proteinnet_string = fp.readlines()
file_in_progress = []
for line in proteinnet_string:
if(line == "[ID]\n"):
if(len(file_in_progress) > 0):
_write_file(args, file_in_progress)
file_in_progress = []
file_in_progress.append(line.strip())
if(len(file_in_progress) > 0):
_write_file(args, file_in_progress)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"proteinnet_file", type=str,
help="Path to ProteinNet file to unpack"
)
parser.add_argument(
"output_dir", type=str,
help="Path to directory in which to output .core files"
)
args = parser.parse_args()
main(args)