Unverified commit 56a32c3b, authored by Yuanqi Wang, committed by GitHub
Browse files

Add more parameter functionality to HHBlits and HHSearch runners (#51)



* [ray] use ray for DB queries

* patched openmm

* add test exec script

* remove output before exec

* [tools] add coverage param to HHBlits

* [tools] add more param options to HHSearch

* [tools] remove ray from JackHmmer runner

* [tools] add comment for cov param in hhblits

* [tools] cleanup all debug code

* [tools] restore original files
Co-authored-by: shenggan <csg19971016@gmail.com>
parent ae44a3b7
...@@ -46,6 +46,7 @@ class HHBlits: ...@@ -46,6 +46,7 @@ class HHBlits:
alt: Optional[int] = None, alt: Optional[int] = None,
p: int = _HHBLITS_DEFAULT_P, p: int = _HHBLITS_DEFAULT_P,
z: int = _HHBLITS_DEFAULT_Z, z: int = _HHBLITS_DEFAULT_Z,
cov: int = 0,
): ):
"""Initializes the Python HHblits wrapper. """Initializes the Python HHblits wrapper.
...@@ -71,6 +72,8 @@ class HHBlits: ...@@ -71,6 +72,8 @@ class HHBlits:
HHblits default: 20. HHblits default: 20.
z: Hard cap on number of hits reported in the hhr file. z: Hard cap on number of hits reported in the hhr file.
HHblits default: 500. NB: The relevant HHblits flag is -Z not -z. HHblits default: 500. NB: The relevant HHblits flag is -Z not -z.
cov: Minimum coverage with master sequence (%).
HHBlits default: 0
Raises: Raises:
RuntimeError: If HHblits binary not found within the path. RuntimeError: If HHblits binary not found within the path.
...@@ -98,6 +101,7 @@ class HHBlits: ...@@ -98,6 +101,7 @@ class HHBlits:
self.alt = alt self.alt = alt
self.p = p self.p = p
self.z = z self.z = z
self.cov = cov
def query(self, input_fasta_path: str) -> Mapping[str, Any]: def query(self, input_fasta_path: str) -> Mapping[str, Any]:
"""Queries the database using HHblits.""" """Queries the database using HHblits."""
...@@ -139,6 +143,8 @@ class HHBlits: ...@@ -139,6 +143,8 @@ class HHBlits:
cmd += ["-p", str(self.p)] cmd += ["-p", str(self.p)]
if self.z != _HHBLITS_DEFAULT_Z: if self.z != _HHBLITS_DEFAULT_Z:
cmd += ["-Z", str(self.z)] cmd += ["-Z", str(self.z)]
if self.cov:
cmd += ["-cov", str(self.cov)]
cmd += db_cmd cmd += db_cmd
logging.info('Launching subprocess "%s"', " ".join(cmd)) logging.info('Launching subprocess "%s"', " ".join(cmd))
......
...@@ -18,7 +18,7 @@ import glob ...@@ -18,7 +18,7 @@ import glob
import logging import logging
import os import os
import subprocess import subprocess
from typing import Sequence from typing import Sequence, Union
from fastfold.data.tools import utils from fastfold.data.tools import utils
...@@ -33,6 +33,14 @@ class HHSearch: ...@@ -33,6 +33,14 @@ class HHSearch:
databases: Sequence[str], databases: Sequence[str],
n_cpu: int = 2, n_cpu: int = 2,
maxseq: int = 1_000_000, maxseq: int = 1_000_000,
mact: float = 0.35,
min_align: int = 10,
max_align: int = 500,
min_lines: int = 10,
max_lines: int = 500,
aliw: int = 100000,
e_value: float = 0.001,
min_prob: float = 20.0,
): ):
"""Initializes the Python HHsearch wrapper. """Initializes the Python HHsearch wrapper.
...@@ -44,6 +52,15 @@ class HHSearch: ...@@ -44,6 +52,15 @@ class HHSearch:
n_cpu: The number of CPUs to use n_cpu: The number of CPUs to use
maxseq: The maximum number of rows in an input alignment. Note that this maxseq: The maximum number of rows in an input alignment. Note that this
parameter is only supported in HHBlits version 3.1 and higher. parameter is only supported in HHBlits version 3.1 and higher.
mact: Posterior probability threshold for MAC realignment controlling greediness at alignment
ends.
min_align: Minimum number of alignments in alignment list. (-b)
max_align: Maximum number of alignments in alignment list. (-B)
min_lines: Minimum number of lines in summary hit list. (-z)
max_lines: Maximum number of lines in summary hit list. (-Z)
aliw: Number of columns per line in alignment list.
e_value: E-value cutoff for inclusion in result alignment. (-e)
min_prob: Minimum probability in summary and alignment list. (-p)
Raises: Raises:
RuntimeError: If HHsearch binary not found within the path. RuntimeError: If HHsearch binary not found within the path.
...@@ -52,6 +69,14 @@ class HHSearch: ...@@ -52,6 +69,14 @@ class HHSearch:
self.databases = databases self.databases = databases
self.n_cpu = n_cpu self.n_cpu = n_cpu
self.maxseq = maxseq self.maxseq = maxseq
self.mact = mact
self.min_align = min_align
self.max_align = max_align
self.min_lines = min_lines
self.max_lines = max_lines
self.aliw = aliw
self.e_value = e_value
self.min_prob = min_prob
for database_path in self.databases: for database_path in self.databases:
if not glob.glob(database_path + "_*"): if not glob.glob(database_path + "_*"):
...@@ -62,11 +87,12 @@ class HHSearch: ...@@ -62,11 +87,12 @@ class HHSearch:
f"Could not find HHsearch database {database_path}" f"Could not find HHsearch database {database_path}"
) )
def query(self, a3m: str) -> str: def query(self, a3m: str, gen_atab: bool = False) -> Union[str, tuple]:
"""Queries the database using HHsearch using a given a3m.""" """Queries the database using HHsearch using a given a3m."""
with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir: with utils.tmpdir_manager(base_dir="/tmp") as query_tmp_dir:
input_path = os.path.join(query_tmp_dir, "query.a3m") input_path = os.path.join(query_tmp_dir, "query.a3m")
hhr_path = os.path.join(query_tmp_dir, "output.hhr") hhr_path = os.path.join(query_tmp_dir, "output.hhr")
atab_path = os.path.join(query_tmp_dir, "output.atab")
with open(input_path, "w") as f: with open(input_path, "w") as f:
f.write(a3m) f.write(a3m)
...@@ -84,7 +110,25 @@ class HHSearch: ...@@ -84,7 +110,25 @@ class HHSearch:
str(self.maxseq), str(self.maxseq),
"-cpu", "-cpu",
str(self.n_cpu), str(self.n_cpu),
"-b",
str(self.min_align),
"-B",
str(self.max_align),
"-z",
str(self.min_lines),
"-Z",
str(self.max_lines),
"-mact",
str(self.mact),
"-aliw",
str(self.aliw),
"-e",
str(self.e_value),
"-p",
str(self.min_prob),
] + db_cmd ] + db_cmd
if gen_atab:
cmd += ["-atab", atab_path]
logging.info('Launching subprocess "%s"', " ".join(cmd)) logging.info('Launching subprocess "%s"', " ".join(cmd))
process = subprocess.Popen( process = subprocess.Popen(
...@@ -103,4 +147,10 @@ class HHSearch: ...@@ -103,4 +147,10 @@ class HHSearch:
with open(hhr_path) as f: with open(hhr_path) as f:
hhr = f.read() hhr = f.read()
return hhr if gen_atab:
with open(atab_path) as f:
atab = f.read()
if gen_atab:
return hhr, atab
else:
return hhr
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment