Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
OpenFold
Commits
edffead3
Commit
edffead3
authored
Dec 19, 2021
by
Gustaf Ahdritz
Browse files
Make AlignmentRunner more flexible
parent
f4316dc0
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
153 additions
and
74 deletions
+153
-74
openfold/data/data_pipeline.py
openfold/data/data_pipeline.py
+153
-62
run_pretrained_openfold.py
run_pretrained_openfold.py
+0
-8
scripts/precompute_alignments.py
scripts/precompute_alignments.py
+0
-1
scripts/utils.py
scripts/utils.py
+0
-3
No files found.
openfold/data/data_pipeline.py
View file @
edffead3
...
@@ -219,53 +219,137 @@ class AlignmentRunner:
...
@@ -219,53 +219,137 @@ class AlignmentRunner:
def
__init__
(
def
__init__
(
self
,
self
,
jackhmmer_binary_path
:
str
,
jackhmmer_binary_path
:
Optional
[
str
]
=
None
,
hhblits_binary_path
:
str
,
hhblits_binary_path
:
Optional
[
str
]
=
None
,
hhsearch_binary_path
:
str
,
hhsearch_binary_path
:
Optional
[
str
]
=
None
,
uniref90_database_path
:
str
,
uniref90_database_path
:
Optional
[
str
]
=
None
,
mgnify_database_path
:
str
,
mgnify_database_path
:
Optional
[
str
]
=
None
,
bfd_database_path
:
Optional
[
str
],
bfd_database_path
:
Optional
[
str
]
=
None
,
uniclust30_database_path
:
Optional
[
str
],
uniclust30_database_path
:
Optional
[
str
]
=
None
,
small_bfd_database_path
:
Optional
[
str
],
pdb70_database_path
:
Optional
[
str
]
=
None
,
pdb70_database_path
:
str
,
use_small_bfd
:
Optional
[
bool
]
=
None
,
use_small_bfd
:
bool
,
no_cpus
:
Optional
[
int
]
=
None
,
no_cpus
:
int
,
uniref_max_hits
:
int
=
10000
,
uniref_max_hits
:
int
=
10000
,
mgnify_max_hits
:
int
=
5000
,
mgnify_max_hits
:
int
=
5000
,
):
):
self
.
_use_small_bfd
=
use_small_bfd
"""
self
.
jackhmmer_uniref90_runner
=
jackhmmer
.
Jackhmmer
(
Args:
binary_path
=
jackhmmer_binary_path
,
jackhmmer_binary_path:
database_path
=
uniref90_database_path
,
Path to jackhmmer binary
n_cpu
=
no_cpus
,
hhblits_binary_path:
)
Path to hhblits binary
hhsearch_binary_path:
Path to hhsearch binary
uniref90_database_path:
Path to uniref90 database. If provided, jackhmmer_binary_path
must also be provided
mgnify_database_path:
Path to mgnify database. If provided, jackhmmer_binary_path
must also be provided
bfd_database_path:
Path to BFD database. Depending on the value of use_small_bfd,
one of hhblits_binary_path or jackhmmer_binary_path must be
provided.
uniclust30_database_path:
Path to uniclust30. Searched alongside BFD if use_small_bfd is
false.
pdb70_database_path:
Path to pdb70 database.
use_small_bfd:
Whether to search the BFD database alone with jackhmmer or
in conjunction with uniclust30 with hhblits.
no_cpus:
The number of CPUs available for alignment
uniref_max_hits:
Max number of uniref hits
mgnify_max_hits:
Max number of mgnify hits
"""
db_map
=
{
"jackhmmer"
:
{
"binary"
:
jackhmmer_binary_path
,
"dbs"
:
[
uniref90_database_path
,
mgnify_database_path
,
bfd_database_path
if
use_small_bfd
else
None
,
],
},
"hhblits"
:
{
"binary"
:
hhblits_binary_path
,
"dbs"
:
[
bfd_database_path
if
not
use_small_bfd
else
None
,
],
},
"hhsearch"
:
{
"binary"
:
hhsearch_binary_path
,
"dbs"
:
[
pdb70_database_path
,
],
},
}
for
name
,
dic
in
db_map
.
items
():
binary
,
dbs
=
dic
[
"binary"
],
dic
[
"dbs"
]
if
(
binary
is
None
and
not
all
([
x
is
None
for
x
in
dbs
])):
raise
ValueError
(
f
"
{
name
}
DBs provided but
{
name
}
binary is None"
)
if
(
not
all
([
x
is
None
for
x
in
db_map
[
"hhsearch"
][
"dbs"
]])
and
uniref90_database_path
is
None
):
raise
ValueError
(
"""uniref90_database_path must be specified in order to perform
template search"""
)
if
use_small_bfd
:
self
.
uniref_max_hits
=
uniref_max_hits
self
.
jackhmmer_small_bfd_runner
=
jackhmmer
.
Jackhmmer
(
self
.
mgnify_max_hits
=
mgnify_max_hits
self
.
use_small_bfd
=
use_small_bfd
self
.
jackhmmer_uniref90_runner
=
None
if
(
jackhmmer_binary_path
is
not
None
and
uniref90_database_path
is
not
None
):
self
.
jackhmmer_uniref90_runner
=
jackhmmer
.
Jackhmmer
(
binary_path
=
jackhmmer_binary_path
,
binary_path
=
jackhmmer_binary_path
,
database_path
=
small_bfd
_database_path
,
database_path
=
uniref90
_database_path
,
n_cpu
=
no_cpus
,
n_cpu
=
no_cpus
,
)
)
else
:
self
.
hhblits_bfd_uniclust_runner
=
hhblits
.
HHBlits
(
self
.
jackhmmer_small_bfd_runner
=
None
binary_path
=
hhblits_binary_path
,
self
.
hhblits_bfd_uniclust_runner
=
None
databases
=
[
bfd_database_path
,
uniclust30_database_path
],
if
(
bfd_database_path
is
not
None
):
if
use_small_bfd
:
self
.
jackhmmer_small_bfd_runner
=
jackhmmer
.
Jackhmmer
(
binary_path
=
jackhmmer_binary_path
,
database_path
=
bfd_database_path
,
n_cpu
=
no_cpus
,
)
else
:
dbs
=
[
bfd_database_path
]
if
(
uniclust30_database_path
is
not
None
):
dbs
.
append
(
uniclust30_database_path
)
self
.
hhblits_bfd_uniclust_runner
=
hhblits
.
HHBlits
(
binary_path
=
hhblits_binary_path
,
databases
=
dbs
,
n_cpu
=
no_cpus
,
)
self
.
jackhmmer_mgnify_runner
=
None
if
(
mgnify_database_path
is
not
None
):
self
.
jackhmmer_mgnify_runner
=
jackhmmer
.
Jackhmmer
(
binary_path
=
jackhmmer_binary_path
,
database_path
=
mgnify_database_path
,
n_cpu
=
no_cpus
,
n_cpu
=
no_cpus
,
)
)
self
.
jackhmmer_mgnify_runner
=
jackhmmer
.
Jackhmmer
(
self
.
hhsearch_pdb70_runner
=
None
binary_path
=
jackhmmer_binary_path
,
if
(
pdb70_database_path
is
not
None
):
database_path
=
mgnify_database_path
,
self
.
hhsearch_pdb70_runner
=
hhsearch
.
HHSearch
(
n_cpu
=
no_cpus
,
binary_path
=
hhsearch_binary_path
,
)
databases
=
[
pdb70_database_path
],
n_cpu
=
no_cpus
,
self
.
hhsearch_pdb70_runner
=
hhsearch
.
HHSearch
(
)
binary_path
=
hhsearch_binary_path
,
databases
=
[
pdb70_database_path
],
n_cpu
=
no_cpus
,
)
self
.
uniref_max_hits
=
uniref_max_hits
self
.
mgnify_max_hits
=
mgnify_max_hits
def
run
(
def
run
(
self
,
self
,
...
@@ -273,39 +357,46 @@ class AlignmentRunner:
...
@@ -273,39 +357,46 @@ class AlignmentRunner:
output_dir
:
str
,
output_dir
:
str
,
):
):
"""Runs alignment tools on a sequence"""
"""Runs alignment tools on a sequence"""
jackhmmer_uniref90_result
=
self
.
jackhmmer_uniref90_runner
.
query
(
if
(
self
.
jackhmmer_uniref90_runner
is
not
None
):
fasta_path
jackhmmer_uniref90_result
=
self
.
jackhmmer_uniref90_runner
.
query
(
)[
0
]
fasta_path
uniref90_msa_as_a3m
=
parsers
.
convert_stockholm_to_a3m
(
)[
0
]
jackhmmer_uniref90_result
[
"sto"
],
max_sequences
=
self
.
uniref_max_hits
uniref90_msa_as_a3m
=
parsers
.
convert_stockholm_to_a3m
(
)
jackhmmer_uniref90_result
[
"sto"
],
uniref90_out_path
=
os
.
path
.
join
(
output_dir
,
"
uniref
90
_hits
.a3m"
)
max_sequences
=
self
.
uniref
_max
_hits
with
open
(
uniref90_out_path
,
"w"
)
as
f
:
)
f
.
write
(
uniref90_
msa_as_
a3m
)
uniref90_out_path
=
os
.
path
.
join
(
output_dir
,
"
uniref90_
hits.
a3m
"
)
with
open
(
uniref90_out_path
,
"w"
)
as
f
:
jackhmmer_mgnify_result
=
self
.
jackhmmer_mgnify_runner
.
query
(
f
.
write
(
uniref90_msa_as_a3m
)
fasta_path
)[
0
]
if
(
self
.
hhsearch_pdb70_runner
is
not
None
):
mgnify_msa_as_a3m
=
parsers
.
convert_stockholm_to_a3m
(
hhsearch_result
=
self
.
hhsearch_pdb70_runner
.
query
(
jackhmmer_mgnify_result
[
"sto"
],
max_sequences
=
self
.
mgnify_max_hits
uniref90_msa_as_a3m
)
)
mgnify
_out_path
=
os
.
path
.
join
(
output_dir
,
"
mgnify
_hits.
a3m
"
)
pdb70
_out_path
=
os
.
path
.
join
(
output_dir
,
"
pdb70
_hits.
hhr
"
)
with
open
(
mgnify
_out_path
,
"w"
)
as
f
:
with
open
(
pdb70
_out_path
,
"w"
)
as
f
:
f
.
write
(
mgnify_msa_as_a3m
)
f
.
write
(
hhsearch_result
)
hhsearch_result
=
self
.
hhsearch_pdb70_runner
.
query
(
uniref90_msa_as_a3m
)
if
(
self
.
jackhmmer_mgnify_runner
is
not
None
):
pdb70_out_path
=
os
.
path
.
join
(
output_dir
,
"pdb70_hits.hhr"
)
jackhmmer_mgnify_result
=
self
.
jackhmmer_mgnify_runner
.
query
(
with
open
(
pdb70_out_path
,
"w"
)
as
f
:
fasta_path
f
.
write
(
hhsearch_result
)
)[
0
]
mgnify_msa_as_a3m
=
parsers
.
convert_stockholm_to_a3m
(
jackhmmer_mgnify_result
[
"sto"
],
max_sequences
=
self
.
mgnify_max_hits
)
mgnify_out_path
=
os
.
path
.
join
(
output_dir
,
"mgnify_hits.a3m"
)
with
open
(
mgnify_out_path
,
"w"
)
as
f
:
f
.
write
(
mgnify_msa_as_a3m
)
if
self
.
_
use_small_bfd
:
if
(
self
.
use_small_bfd
and
self
.
jackhmmer_small_bfd_runner
is
not
None
)
:
jackhmmer_small_bfd_result
=
self
.
jackhmmer_small_bfd_runner
.
query
(
jackhmmer_small_bfd_result
=
self
.
jackhmmer_small_bfd_runner
.
query
(
fasta_path
fasta_path
)[
0
]
)[
0
]
bfd_out_path
=
os
.
path
.
join
(
output_dir
,
"small_bfd_hits.sto"
)
bfd_out_path
=
os
.
path
.
join
(
output_dir
,
"small_bfd_hits.sto"
)
with
open
(
bfd_out_path
,
"w"
)
as
f
:
with
open
(
bfd_out_path
,
"w"
)
as
f
:
f
.
write
(
jackhmmer_small_bfd_result
[
"sto"
])
f
.
write
(
jackhmmer_small_bfd_result
[
"sto"
])
el
se
:
el
if
(
self
.
hhblits_bfd_uniclust_runner
is
not
None
)
:
hhblits_bfd_uniclust_result
=
(
hhblits_bfd_uniclust_result
=
(
self
.
hhblits_bfd_uniclust_runner
.
query
(
fasta_path
)
self
.
hhblits_bfd_uniclust_runner
.
query
(
fasta_path
)
)
)
...
...
run_pretrained_openfold.py
View file @
edffead3
...
@@ -105,7 +105,6 @@ def main(args):
...
@@ -105,7 +105,6 @@ def main(args):
mgnify_database_path
=
args
.
mgnify_database_path
,
mgnify_database_path
=
args
.
mgnify_database_path
,
bfd_database_path
=
args
.
bfd_database_path
,
bfd_database_path
=
args
.
bfd_database_path
,
uniclust30_database_path
=
args
.
uniclust30_database_path
,
uniclust30_database_path
=
args
.
uniclust30_database_path
,
small_bfd_database_path
=
args
.
small_bfd_database_path
,
pdb70_database_path
=
args
.
pdb70_database_path
,
pdb70_database_path
=
args
.
pdb70_database_path
,
use_small_bfd
=
use_small_bfd
,
use_small_bfd
=
use_small_bfd
,
no_cpus
=
args
.
cpus
,
no_cpus
=
args
.
cpus
,
...
@@ -229,11 +228,4 @@ if __name__ == "__main__":
...
@@ -229,11 +228,4 @@ if __name__ == "__main__":
--model_device for better performance"""
--model_device for better performance"""
)
)
if
(
args
.
bfd_database_path
is
None
and
args
.
small_bfd_database_path
is
None
):
raise
ValueError
(
"At least one of --bfd_database_path or --small_bfd_database_path"
"must be specified"
)
main
(
args
)
main
(
args
)
scripts/precompute_alignments.py
View file @
edffead3
...
@@ -25,7 +25,6 @@ def main(args):
...
@@ -25,7 +25,6 @@ def main(args):
mgnify_database_path
=
args
.
mgnify_database_path
,
mgnify_database_path
=
args
.
mgnify_database_path
,
bfd_database_path
=
args
.
bfd_database_path
,
bfd_database_path
=
args
.
bfd_database_path
,
uniclust30_database_path
=
args
.
uniclust30_database_path
,
uniclust30_database_path
=
args
.
uniclust30_database_path
,
small_bfd_database_path
=
args
.
small_bfd_database_path
,
pdb70_database_path
=
args
.
pdb70_database_path
,
pdb70_database_path
=
args
.
pdb70_database_path
,
use_small_bfd
=
args
.
bfd_database_path
is
None
,
use_small_bfd
=
args
.
bfd_database_path
is
None
,
no_cpus
=
args
.
cpus
,
no_cpus
=
args
.
cpus
,
...
...
scripts/utils.py
View file @
edffead3
...
@@ -21,9 +21,6 @@ def add_data_args(parser: argparse.ArgumentParser):
...
@@ -21,9 +21,6 @@ def add_data_args(parser: argparse.ArgumentParser):
parser
.
add_argument
(
parser
.
add_argument
(
'--bfd_database_path'
,
type
=
str
,
default
=
None
,
'--bfd_database_path'
,
type
=
str
,
default
=
None
,
)
)
parser
.
add_argument
(
'--small_bfd_database_path'
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
parser
.
add_argument
(
'--jackhmmer_binary_path'
,
type
=
str
,
default
=
'/usr/bin/jackhmmer'
'--jackhmmer_binary_path'
,
type
=
str
,
default
=
'/usr/bin/jackhmmer'
)
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment