Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
OpenFold
Commits
2c7ce956
Commit
2c7ce956
authored
Nov 10, 2021
by
Gustaf
Browse files
Add more ProteinNet functionality
parent
78fa6c6e
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
99 additions
and
22 deletions
+99
-22
README.md
README.md
+2
-2
openfold/data/data_modules.py
openfold/data/data_modules.py
+8
-5
openfold/data/data_pipeline.py
openfold/data/data_pipeline.py
+3
-2
scripts/data_dir_to_fasta.py
scripts/data_dir_to_fasta.py
+69
-0
scripts/precompute_alignments.py
scripts/precompute_alignments.py
+17
-11
scripts/precompute_alignments_mmseqs.py
scripts/precompute_alignments_mmseqs.py
+0
-2
No files found.
README.md
View file @
2c7ce956
...
@@ -167,8 +167,8 @@ python3 scripts/precompute_alignments_mmseqs.py input.fasta \
...
@@ -167,8 +167,8 @@ python3 scripts/precompute_alignments_mmseqs.py input.fasta \
```
```
where
`input.fasta`
is a FASTA file containing one or more query sequences. To
where
`input.fasta`
is a FASTA file containing one or more query sequences. To
generate an input FASTA from a directory of mmCIF
files, we provide
generate an input FASTA from a directory of mmCIF
and/or ProteinNet .core
`scripts/
mmcif
_dir_to_fasta.py`
.
files, we provide
`scripts/
data
_dir_to_fasta.py`
.
Next, generate a cache of certain datapoints in the mmCIF files:
Next, generate a cache of certain datapoints in the mmCIF files:
...
...
openfold/data/data_modules.py
View file @
2c7ce956
...
@@ -152,16 +152,19 @@ class OpenFoldSingleDataset(torch.utils.data.Dataset):
...
@@ -152,16 +152,19 @@ class OpenFoldSingleDataset(torch.utils.data.Dataset):
file_id
,
=
spl
file_id
,
=
spl
chain_id
=
None
chain_id
=
None
path
=
os
.
path
.
join
(
self
.
data_dir
,
file_id
+
'.cif'
)
path
=
os
.
path
.
join
(
self
.
data_dir
,
file_id
)
if
(
os
.
path
.
exists
(
path
)):
if
(
os
.
path
.
exists
(
path
+
".cif"
)):
data
=
self
.
_parse_mmcif
(
data
=
self
.
_parse_mmcif
(
path
,
file_id
,
chain_id
,
alignment_dir
path
+
".cif"
,
file_id
,
chain_id
,
alignment_dir
)
elif
(
os
.
path
.
exists
(
path
+
".core"
)):
data
=
self
.
data_pipeline
.
process_core
(
path
+
".core"
,
alignment_dir
)
)
else
:
else
:
# Try to search for a distillation PDB file instead
# Try to search for a distillation PDB file instead
path
=
os
.
path
.
join
(
self
.
data_dir
,
file_id
+
'.pdb'
)
data
=
self
.
data_pipeline
.
process_pdb
(
data
=
self
.
data_pipeline
.
process_pdb
(
pdb_path
=
path
,
pdb_path
=
path
+
".pdb"
,
alignment_dir
=
alignment_dir
alignment_dir
=
alignment_dir
)
)
else
:
else
:
...
...
openfold/data/data_pipeline.py
View file @
2c7ce956
...
@@ -123,8 +123,9 @@ def make_mmcif_features(
...
@@ -123,8 +123,9 @@ def make_mmcif_features(
def
_aatype_to_str_sequence
(
aatype
):
def
_aatype_to_str_sequence
(
aatype
):
return
str
([
return
''
.
join
([
residue_constants
.
restypes
[
aatype
[
i
]]
for
i
in
range
(
len
(
aatype
))
residue_constants
.
restypes_with_x
[
aatype
[
i
]]
for
i
in
range
(
len
(
aatype
))
])
])
def
make_protein_features
(
def
make_protein_features
(
...
...
scripts/
mmcif
_dir_to_fasta.py
→
scripts/
data
_dir_to_fasta.py
View file @
2c7ce956
...
@@ -3,36 +3,47 @@ import logging
...
@@ -3,36 +3,47 @@ import logging
import
os
import
os
from
openfold.data
import
mmcif_parsing
from
openfold.data
import
mmcif_parsing
from
openfold.np
import
protein
,
residue_constants
def
main
(
args
):
def
main
(
args
):
fasta
=
[]
fasta
=
[]
for
fname
in
os
.
listdir
(
args
.
mmcif
_dir
):
for
fname
in
os
.
listdir
(
args
.
data
_dir
):
basename
,
ext
=
os
.
path
.
splitext
(
fname
)
basename
,
ext
=
os
.
path
.
splitext
(
fname
)
basename
=
basename
.
upper
()
basename
=
basename
.
upper
()
fpath
=
os
.
path
.
join
(
args
.
data_dir
,
fname
)
if
(
ext
==
".cif"
):
with
open
(
fpath
,
'r'
)
as
fp
:
mmcif_str
=
fp
.
read
()
mmcif
=
mmcif_parsing
.
parse
(
file_id
=
basename
,
mmcif_string
=
mmcif_str
)
if
(
mmcif
.
mmcif_object
is
None
):
logging
.
warning
(
f
'Failed to parse
{
fname
}
...'
)
if
(
args
.
raise_errors
):
raise
list
(
mmcif
.
errors
.
values
())[
0
]
else
:
continue
if
(
not
ext
==
".cif"
):
mmcif
=
mmcif
.
mmcif_object
continue
for
chain
,
seq
in
mmcif
.
chain_to_seqres
.
items
():
chain_id
=
'_'
.
join
([
basename
,
chain
])
fpath
=
os
.
path
.
join
(
args
.
mmcif_dir
,
fname
)
fasta
.
append
(
f
">
{
chain_id
}
"
)
with
open
(
fpath
,
'r'
)
as
fp
:
fasta
.
append
(
seq
)
mmcif_str
=
fp
.
read
()
elif
(
ext
==
".core"
):
with
open
(
fpath
,
'r'
)
as
fp
:
mmcif
=
mmcif_parsing
.
parse
(
core_str
=
fp
.
read
()
file_id
=
basename
,
mmcif_string
=
mmcif_str
)
core_protein
=
protein
.
from_proteinnet_string
(
core_str
)
if
(
mmcif
.
mmcif_object
is
None
):
aatype
=
core_protein
.
aatype
logging
.
warning
(
f
'Failed to parse
{
fname
}
...'
)
seq
=
''
.
join
([
if
(
args
.
raise_errors
):
residue_constants
.
restypes_with_x
[
aatype
[
i
]]
raise
list
(
mmcif
.
errors
.
values
())[
0
]
for
i
in
range
(
len
(
aatype
))
else
:
])
continue
fasta
.
append
(
f
">
{
basename
}
"
)
mmcif
=
mmcif
.
mmcif_object
for
chain
,
seq
in
mmcif
.
chain_to_seqres
.
items
():
chain_id
=
'_'
.
join
([
basename
,
chain
])
fasta
.
append
(
f
">
{
chain_id
}
"
)
fasta
.
append
(
seq
)
fasta
.
append
(
seq
)
with
open
(
args
.
output_path
,
"w"
)
as
fp
:
with
open
(
args
.
output_path
,
"w"
)
as
fp
:
fp
.
write
(
'
\n
'
.
join
(
fasta
))
fp
.
write
(
'
\n
'
.
join
(
fasta
))
...
@@ -41,8 +52,8 @@ def main(args):
...
@@ -41,8 +52,8 @@ def main(args):
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
()
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
parser
.
add_argument
(
"
mmcif
_dir"
,
type
=
str
,
"
data
_dir"
,
type
=
str
,
help
=
"Path to a directory containing mmCIF files"
help
=
"Path to a directory containing mmCIF
or .core
files"
)
)
parser
.
add_argument
(
parser
.
add_argument
(
"output_path"
,
type
=
str
,
"output_path"
,
type
=
str
,
...
...
scripts/precompute_alignments.py
View file @
2c7ce956
...
@@ -5,6 +5,7 @@ import tempfile
...
@@ -5,6 +5,7 @@ import tempfile
import
openfold.data.mmcif_parsing
as
mmcif_parsing
import
openfold.data.mmcif_parsing
as
mmcif_parsing
from
openfold.data.data_pipeline
import
AlignmentRunner
from
openfold.data.data_pipeline
import
AlignmentRunner
from
openfold.np
import
protein
,
residue_constants
from
utils
import
add_data_args
from
utils
import
add_data_args
...
@@ -31,11 +32,9 @@ def main(args):
...
@@ -31,11 +32,9 @@ def main(args):
for
f
in
os
.
listdir
(
args
.
input_dir
):
for
f
in
os
.
listdir
(
args
.
input_dir
):
path
=
os
.
path
.
join
(
args
.
input_dir
,
f
)
path
=
os
.
path
.
join
(
args
.
input_dir
,
f
)
is_mmcif
=
f
.
endswith
(
'.cif'
)
is_fasta
=
f
.
endswith
(
'.fasta'
)
file_id
=
os
.
path
.
splitext
(
f
)[
0
]
file_id
=
os
.
path
.
splitext
(
f
)[
0
]
seqs
=
{}
seqs
=
{}
if
(
is_mm
cif
):
if
(
f
.
endswith
(
'.
cif
'
)
):
with
open
(
path
,
'r'
)
as
fp
:
with
open
(
path
,
'r'
)
as
fp
:
mmcif_str
=
fp
.
read
()
mmcif_str
=
fp
.
read
()
mmcif
=
mmcif_parsing
.
parse
(
mmcif
=
mmcif_parsing
.
parse
(
...
@@ -51,7 +50,7 @@ def main(args):
...
@@ -51,7 +50,7 @@ def main(args):
for
k
,
v
in
mmcif
.
chain_to_seqres
.
items
():
for
k
,
v
in
mmcif
.
chain_to_seqres
.
items
():
chain_id
=
'_'
.
join
([
file_id
,
k
])
chain_id
=
'_'
.
join
([
file_id
,
k
])
seqs
[
chain_id
]
=
v
seqs
[
chain_id
]
=
v
elif
(
is_
fasta
):
elif
(
f
.
endswith
(
'.
fasta
'
)
):
with
open
(
path
,
'r'
)
as
fp
:
with
open
(
path
,
'r'
)
as
fp
:
fasta_str
=
fp
.
read
()
fasta_str
=
fp
.
read
()
input_seqs
,
_
=
parsers
.
parse_fasta
(
fasta_str
)
input_seqs
,
_
=
parsers
.
parse_fasta
(
fasta_str
)
...
@@ -63,6 +62,15 @@ def main(args):
...
@@ -63,6 +62,15 @@ def main(args):
logging
.
warning
(
msg
)
logging
.
warning
(
msg
)
input_sequence
=
input_seqs
[
0
]
input_sequence
=
input_seqs
[
0
]
seqs
[
file_id
]
=
input_sequence
seqs
[
file_id
]
=
input_sequence
elif
(
f
.
endswith
(
'.core'
)):
with
open
(
path
,
'r'
)
as
fp
:
core_str
=
fp
.
read
()
core_prot
=
protein
.
from_proteinnet_string
(
core_str
)
seq
=
''
.
join
([
residue_constants
.
restypes_with_x
[
aatype
[
i
]]
for
i
in
range
(
len
(
aatype
))
])
seqs
[
file_id
]
=
seq
else
:
else
:
continue
continue
...
@@ -74,17 +82,15 @@ def main(args):
...
@@ -74,17 +82,15 @@ def main(args):
os
.
makedirs
(
alignment_dir
)
os
.
makedirs
(
alignment_dir
)
if
(
not
is_fasta
):
fd
,
fasta_path
=
tempfile
.
mkstemp
(
suffix
=
".fasta"
)
fd
,
fasta_path
=
tempfile
.
mkstemp
(
suffix
=
".fasta"
)
with
os
.
fdopen
(
fd
,
'w'
)
as
fp
:
with
os
.
fdopen
(
fd
,
'w'
)
as
fp
:
fp
.
write
(
f
'>query
\n
{
seq
}
'
)
fp
.
write
(
f
'>query
\n
{
seq
}
'
)
alignment_runner
.
run
(
alignment_runner
.
run
(
f
if
is_fasta
else
fasta_path
,
alignment_dir
fasta_path
,
alignment_dir
)
)
if
(
not
is_fasta
):
os
.
remove
(
fasta_path
)
os
.
remove
(
fasta_path
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
scripts/precompute_alignments_mmseqs.py
View file @
2c7ce956
...
@@ -114,8 +114,6 @@ def main(args):
...
@@ -114,8 +114,6 @@ def main(args):
not
os
.
path
.
splitext
(
fname
)[
-
1
]
==
".a3m"
):
not
os
.
path
.
splitext
(
fname
)[
-
1
]
==
".a3m"
):
continue
continue
print
(
fpath
)
with
open
(
fpath
,
"r"
)
as
fp
:
with
open
(
fpath
,
"r"
)
as
fp
:
a3m
=
fp
.
read
()
a3m
=
fp
.
read
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment