"docs/source/README.md" did not exist on "996e8260b36a1d06bd550901e1dec8e7ad0c6eb9"
Commit 425bdb5e authored by Christina Floristean
Browse files

Added UniRef30 to data pipeline

parent 68828c49
......@@ -330,6 +330,7 @@ class AlignmentRunner:
uniref90_database_path: Optional[str] = None,
mgnify_database_path: Optional[str] = None,
bfd_database_path: Optional[str] = None,
uniref30_database_path: Optional[str] = None,
uniclust30_database_path: Optional[str] = None,
uniprot_database_path: Optional[str] = None,
template_searcher: Optional[TemplateSearcher] = None,
......@@ -355,12 +356,15 @@ class AlignmentRunner:
Path to BFD database. Depending on the value of use_small_bfd,
one of hhblits_binary_path or jackhmmer_binary_path must be
provided.
uniref30_database_path:
Path to uniref30. Searched alongside BFD if use_small_bfd is
false.
uniclust30_database_path:
Path to uniclust30. Searched alongside BFD if use_small_bfd is
false.
use_small_bfd:
Whether to search the BFD database alone with jackhmmer or
in conjunction with uniclust30 with hhblits.
in conjunction with uniref30/uniclust30 with hhblits.
no_cpus:
The number of CPUs available for alignment. By default, all
CPUs are used.
......@@ -413,7 +417,7 @@ class AlignmentRunner:
)
self.jackhmmer_small_bfd_runner = None
self.hhblits_bfd_uniclust_runner = None
self.hhblits_bfd_unirefclust_runner = None
if(bfd_database_path is not None):
if use_small_bfd:
self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer(
......@@ -423,9 +427,11 @@ class AlignmentRunner:
)
else:
dbs = [bfd_database_path]
if(uniclust30_database_path is not None):
if(uniref30_database_path is not None):
dbs.append(uniref30_database_path)
if (uniclust30_database_path is not None):
dbs.append(uniclust30_database_path)
self.hhblits_bfd_uniclust_runner = hhblits.HHBlits(
self.hhblits_bfd_unirefclust_runner = hhblits.HHBlits(
binary_path=hhblits_binary_path,
databases=dbs,
n_cpu=no_cpus,
......@@ -516,10 +522,17 @@ class AlignmentRunner:
msa_out_path=bfd_out_path,
msa_format="sto",
)
elif(self.hhblits_bfd_uniclust_runner is not None):
bfd_out_path = os.path.join(output_dir, "bfd_uniclust_hits.a3m")
hhblits_bfd_uniclust_result = run_msa_tool(
msa_runner=self.hhblits_bfd_uniclust_runner,
elif(self.hhblits_bfd_unirefclust_runner is not None):
uni_name = "uni"
for db_name in self.hhblits_bfd_unirefclust_runner.databases:
if "uniref" in db_name.lower():
uni_name = f"{uni_name}ref"
elif "uniclust" in db_name.lower():
uni_name = f"{uni_name}clust"
bfd_out_path = os.path.join(output_dir, f"bfd_{uni_name}_hits.a3m")
hhblits_bfd_unirefclust_result = run_msa_tool(
msa_runner=self.hhblits_bfd_unirefclust_runner,
fasta_path=fasta_path,
msa_out_path=bfd_out_path,
msa_format="a3m",
......
......@@ -88,6 +88,7 @@ def precompute_alignments(tags, seqs, alignment_dir, args, is_multimer):
uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path,
uniref30_database_path=args.uniref30_database_path,
uniclust30_database_path=args.uniclust30_database_path,
no_cpus=args.cpus,
)
......@@ -208,6 +209,7 @@ def main(args):
uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path,
uniref30_database_path=args.uniref30_database_path,
uniclust30_database_path=args.uniclust30_database_path,
uniprot_database_path=args.uniprot_database_path,
template_searcher=template_searcher,
......
......@@ -38,4 +38,7 @@ BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4 --check-certificate=false
gunzip "${ROOT_DIR}/${BASENAME}"
tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
--directory="${ROOT_DIR}"
rm "${ROOT_DIR}/${BASENAME}"
......@@ -11,6 +11,7 @@ import tempfile
import openfold.data.mmcif_parsing as mmcif_parsing
from openfold.data.data_pipeline import AlignmentRunner
from openfold.data.parsers import parse_fasta
from openfold.data.tools import hhsearch, hmmsearch
from openfold.np import protein, residue_constants
from utils import add_data_args
......@@ -114,15 +115,30 @@ def parse_and_align(files, alignment_runner, args):
def main(args):
# Build the alignment tool runner
if (args.hmmsearch_binary_path is not None):
template_searcher = hmmsearch.Hmmsearch(
binary_path=args.hmmsearch_binary_path,
hmmbuild_binary_path=args.hmmbuild_binary_path,
database_path=args.pdb_seqres_database_path,
)
elif (args.hhsearch_binary_path is not None):
template_searcher = hhsearch.HHSearch(
binary_path=args.hhsearch_binary_path,
databases=[args.pdb70_database_path],
)
else:
template_searcher = None
alignment_runner = AlignmentRunner(
jackhmmer_binary_path=args.jackhmmer_binary_path,
hhblits_binary_path=args.hhblits_binary_path,
hhsearch_binary_path=args.hhsearch_binary_path,
uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path,
uniref30_database_path=args.uniref30_database_path,
uniclust30_database_path=args.uniclust30_database_path,
pdb70_database_path=args.pdb70_database_path,
uniprot_database_path=args.uniprot_database_path,
template_searcher=template_searcher,
use_small_bfd=args.bfd_database_path is None,
no_cpus=args.cpus_per_task,
)
......
......@@ -17,6 +17,9 @@ def add_data_args(parser: argparse.ArgumentParser):
parser.add_argument(
'--pdb_seqres_database_path', type=str, default=None,
)
parser.add_argument(
'--uniref30_database_path', type=str, default=None,
)
parser.add_argument(
'--uniclust30_database_path', type=str, default=None,
)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment