"lib/bindings/git@developer.sourcefind.cn:OpenDAS/dynamo.git" did not exist on "110f3f8caeff051b32f168c44fda9faa0d71ed18"
Commit 425bdb5e authored by Christina Floristean
Browse files

Added UniRef30 to data pipeline

parent 68828c49
...@@ -330,6 +330,7 @@ class AlignmentRunner: ...@@ -330,6 +330,7 @@ class AlignmentRunner:
uniref90_database_path: Optional[str] = None, uniref90_database_path: Optional[str] = None,
mgnify_database_path: Optional[str] = None, mgnify_database_path: Optional[str] = None,
bfd_database_path: Optional[str] = None, bfd_database_path: Optional[str] = None,
uniref30_database_path: Optional[str] = None,
uniclust30_database_path: Optional[str] = None, uniclust30_database_path: Optional[str] = None,
uniprot_database_path: Optional[str] = None, uniprot_database_path: Optional[str] = None,
template_searcher: Optional[TemplateSearcher] = None, template_searcher: Optional[TemplateSearcher] = None,
...@@ -355,12 +356,15 @@ class AlignmentRunner: ...@@ -355,12 +356,15 @@ class AlignmentRunner:
Path to BFD database. Depending on the value of use_small_bfd, Path to BFD database. Depending on the value of use_small_bfd,
one of hhblits_binary_path or jackhmmer_binary_path must be one of hhblits_binary_path or jackhmmer_binary_path must be
provided. provided.
uniref30_database_path:
Path to uniref30. Searched alongside BFD if use_small_bfd is
false.
uniclust30_database_path: uniclust30_database_path:
Path to uniclust30. Searched alongside BFD if use_small_bfd is Path to uniclust30. Searched alongside BFD if use_small_bfd is
false. false.
use_small_bfd: use_small_bfd:
Whether to search the BFD database alone with jackhmmer or Whether to search the BFD database alone with jackhmmer or
in conjunction with uniclust30 with hhblits. in conjunction with uniref30/uniclust30 with hhblits.
no_cpus: no_cpus:
The number of CPUs available for alignment. By default, all The number of CPUs available for alignment. By default, all
CPUs are used. CPUs are used.
...@@ -413,7 +417,7 @@ class AlignmentRunner: ...@@ -413,7 +417,7 @@ class AlignmentRunner:
) )
self.jackhmmer_small_bfd_runner = None self.jackhmmer_small_bfd_runner = None
self.hhblits_bfd_uniclust_runner = None self.hhblits_bfd_unirefclust_runner = None
if(bfd_database_path is not None): if(bfd_database_path is not None):
if use_small_bfd: if use_small_bfd:
self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer( self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer(
...@@ -423,9 +427,11 @@ class AlignmentRunner: ...@@ -423,9 +427,11 @@ class AlignmentRunner:
) )
else: else:
dbs = [bfd_database_path] dbs = [bfd_database_path]
if(uniclust30_database_path is not None): if(uniref30_database_path is not None):
dbs.append(uniref30_database_path)
if (uniclust30_database_path is not None):
dbs.append(uniclust30_database_path) dbs.append(uniclust30_database_path)
self.hhblits_bfd_uniclust_runner = hhblits.HHBlits( self.hhblits_bfd_unirefclust_runner = hhblits.HHBlits(
binary_path=hhblits_binary_path, binary_path=hhblits_binary_path,
databases=dbs, databases=dbs,
n_cpu=no_cpus, n_cpu=no_cpus,
...@@ -516,10 +522,17 @@ class AlignmentRunner: ...@@ -516,10 +522,17 @@ class AlignmentRunner:
msa_out_path=bfd_out_path, msa_out_path=bfd_out_path,
msa_format="sto", msa_format="sto",
) )
elif(self.hhblits_bfd_uniclust_runner is not None): elif(self.hhblits_bfd_unirefclust_runner is not None):
bfd_out_path = os.path.join(output_dir, "bfd_uniclust_hits.a3m") uni_name = "uni"
hhblits_bfd_uniclust_result = run_msa_tool( for db_name in self.hhblits_bfd_unirefclust_runner.databases:
msa_runner=self.hhblits_bfd_uniclust_runner, if "uniref" in db_name.lower():
uni_name = f"{uni_name}ref"
elif "uniclust" in db_name.lower():
uni_name = f"{uni_name}clust"
bfd_out_path = os.path.join(output_dir, f"bfd_{uni_name}_hits.a3m")
hhblits_bfd_unirefclust_result = run_msa_tool(
msa_runner=self.hhblits_bfd_unirefclust_runner,
fasta_path=fasta_path, fasta_path=fasta_path,
msa_out_path=bfd_out_path, msa_out_path=bfd_out_path,
msa_format="a3m", msa_format="a3m",
......
...@@ -88,6 +88,7 @@ def precompute_alignments(tags, seqs, alignment_dir, args, is_multimer): ...@@ -88,6 +88,7 @@ def precompute_alignments(tags, seqs, alignment_dir, args, is_multimer):
uniref90_database_path=args.uniref90_database_path, uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path, mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path, bfd_database_path=args.bfd_database_path,
uniref30_database_path=args.uniref30_database_path,
uniclust30_database_path=args.uniclust30_database_path, uniclust30_database_path=args.uniclust30_database_path,
no_cpus=args.cpus, no_cpus=args.cpus,
) )
...@@ -208,6 +209,7 @@ def main(args): ...@@ -208,6 +209,7 @@ def main(args):
uniref90_database_path=args.uniref90_database_path, uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path, mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path, bfd_database_path=args.bfd_database_path,
uniref30_database_path=args.uniref30_database_path,
uniclust30_database_path=args.uniclust30_database_path, uniclust30_database_path=args.uniclust30_database_path,
uniprot_database_path=args.uniprot_database_path, uniprot_database_path=args.uniprot_database_path,
template_searcher=template_searcher, template_searcher=template_searcher,
......
...@@ -38,4 +38,7 @@ BASENAME=$(basename "${SOURCE_URL}") ...@@ -38,4 +38,7 @@ BASENAME=$(basename "${SOURCE_URL}")
mkdir --parents "${ROOT_DIR}" mkdir --parents "${ROOT_DIR}"
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4 --check-certificate=false aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" -x 4 --check-certificate=false
gunzip "${ROOT_DIR}/${BASENAME}" tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \
--directory="${ROOT_DIR}"
rm "${ROOT_DIR}/${BASENAME}"
...@@ -11,6 +11,7 @@ import tempfile ...@@ -11,6 +11,7 @@ import tempfile
import openfold.data.mmcif_parsing as mmcif_parsing import openfold.data.mmcif_parsing as mmcif_parsing
from openfold.data.data_pipeline import AlignmentRunner from openfold.data.data_pipeline import AlignmentRunner
from openfold.data.parsers import parse_fasta from openfold.data.parsers import parse_fasta
from openfold.data.tools import hhsearch, hmmsearch
from openfold.np import protein, residue_constants from openfold.np import protein, residue_constants
from utils import add_data_args from utils import add_data_args
...@@ -114,15 +115,30 @@ def parse_and_align(files, alignment_runner, args): ...@@ -114,15 +115,30 @@ def parse_and_align(files, alignment_runner, args):
def main(args): def main(args):
# Build the alignment tool runner # Build the alignment tool runner
if (args.hmmsearch_binary_path is not None):
template_searcher = hmmsearch.Hmmsearch(
binary_path=args.hmmsearch_binary_path,
hmmbuild_binary_path=args.hmmbuild_binary_path,
database_path=args.pdb_seqres_database_path,
)
elif (args.hhsearch_binary_path is not None):
template_searcher = hhsearch.HHSearch(
binary_path=args.hhsearch_binary_path,
databases=[args.pdb70_database_path],
)
else:
template_searcher = None
alignment_runner = AlignmentRunner( alignment_runner = AlignmentRunner(
jackhmmer_binary_path=args.jackhmmer_binary_path, jackhmmer_binary_path=args.jackhmmer_binary_path,
hhblits_binary_path=args.hhblits_binary_path, hhblits_binary_path=args.hhblits_binary_path,
hhsearch_binary_path=args.hhsearch_binary_path,
uniref90_database_path=args.uniref90_database_path, uniref90_database_path=args.uniref90_database_path,
mgnify_database_path=args.mgnify_database_path, mgnify_database_path=args.mgnify_database_path,
bfd_database_path=args.bfd_database_path, bfd_database_path=args.bfd_database_path,
uniref30_database_path=args.uniref30_database_path,
uniclust30_database_path=args.uniclust30_database_path, uniclust30_database_path=args.uniclust30_database_path,
pdb70_database_path=args.pdb70_database_path, uniprot_database_path=args.uniprot_database_path,
template_searcher=template_searcher,
use_small_bfd=args.bfd_database_path is None, use_small_bfd=args.bfd_database_path is None,
no_cpus=args.cpus_per_task, no_cpus=args.cpus_per_task,
) )
......
...@@ -17,6 +17,9 @@ def add_data_args(parser: argparse.ArgumentParser): ...@@ -17,6 +17,9 @@ def add_data_args(parser: argparse.ArgumentParser):
parser.add_argument( parser.add_argument(
'--pdb_seqres_database_path', type=str, default=None, '--pdb_seqres_database_path', type=str, default=None,
) )
parser.add_argument(
'--uniref30_database_path', type=str, default=None,
)
parser.add_argument( parser.add_argument(
'--uniclust30_database_path', type=str, default=None, '--uniclust30_database_path', type=str, default=None,
) )
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment