Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
OpenFold
Commits
22cc45a8
Commit
22cc45a8
authored
Nov 09, 2021
by
Gustaf
Browse files
Begin to add ProteinNet support (WIP)
parent
ed9ca30d
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
206 additions
and
66 deletions
+206
-66
openfold/data/data_pipeline.py
openfold/data/data_pipeline.py
+103
-65
openfold/np/protein.py
openfold/np/protein.py
+57
-1
scripts/unpack_proteinnet.py
scripts/unpack_proteinnet.py
+46
-0
No files found.
openfold/data/data_pipeline.py
View file @
22cc45a8
...
@@ -37,6 +37,33 @@ def empty_template_feats(n_res) -> FeatureDict:
...
@@ -37,6 +37,33 @@ def empty_template_feats(n_res) -> FeatureDict:
}
}
def
make_template_features
(
input_sequence
:
str
,
hits
:
Sequence
[
Any
],
template_featurizer
:
Any
,
query_pdb_code
:
Optional
[
str
]
=
None
,
query_release_date
:
Optional
[
str
]
=
None
,
)
->
FeatureDict
:
hits_cat
=
sum
(
hits
.
values
(),
[])
if
(
len
(
hits_cat
)
==
0
):
template_features
=
empty_template_feats
(
len
(
input_sequence
))
else
:
templates_result
=
template_featurizer
.
get_templates
(
query_sequence
=
input_sequence
,
query_pdb_code
=
query_pdb_code
,
query_release_date
=
query_release_date
,
hits
=
hits_cat
,
)
template_features
=
templates_result
.
features
# The template featurizer doesn't format empty template features
# properly. This is a quick fix.
if
(
template_features
[
"template_aatype"
].
shape
[
0
]
==
0
):
template_features
=
empty_template_feats
(
len
(
input_sequence
))
return
template_features
def
make_sequence_features
(
def
make_sequence_features
(
sequence
:
str
,
description
:
str
,
num_res
:
int
sequence
:
str
,
description
:
str
,
num_res
:
int
)
->
FeatureDict
:
)
->
FeatureDict
:
...
@@ -95,16 +122,21 @@ def make_mmcif_features(
...
@@ -95,16 +122,21 @@ def make_mmcif_features(
return
mmcif_feats
return
mmcif_feats
def
make_pdb_features
(
def
_aatype_to_str_sequence
(
aatype
):
protein_object
:
protein
.
Protein
,
return
str
([
description
:
str
,
residue_constants
.
restypes
[
aatype
[
i
]]
for
i
in
range
(
len
(
aatype
))
confidence_threshold
:
float
=
0.5
,
])
def
make_protein_features
(
protein_object
:
protein
.
Protein
,
description
:
str
,
)
->
FeatureDict
:
)
->
FeatureDict
:
pdb_feats
=
{}
pdb_feats
=
{}
aatype
=
protein_object
.
aatype
sequence
=
_aatype_to_str_sequence
(
aatype
)
pdb_feats
.
update
(
pdb_feats
.
update
(
make_sequence_features
(
make_sequence_features
(
sequence
=
protein_object
.
aatyp
e
,
sequence
=
sequenc
e
,
description
=
description
,
description
=
description
,
num_res
=
len
(
protein_object
.
aatype
),
num_res
=
len
(
protein_object
.
aatype
),
)
)
...
@@ -113,12 +145,6 @@ def make_pdb_features(
...
@@ -113,12 +145,6 @@ def make_pdb_features(
all_atom_positions
=
protein_object
.
atom_positions
all_atom_positions
=
protein_object
.
atom_positions
all_atom_mask
=
protein_object
.
atom_mask
all_atom_mask
=
protein_object
.
atom_mask
high_confidence
=
protein
.
b_factors
>
confidence_threshold
high_confidence
=
np
.
any
(
high_confidence
,
axis
=-
1
)
for
i
,
confident
in
enumerate
(
high_confidence
):
if
(
not
confident
):
all_atom_mask
[
i
]
=
0
pdb_feats
[
"all_atom_positions"
]
=
all_atom_positions
pdb_feats
[
"all_atom_positions"
]
=
all_atom_positions
pdb_feats
[
"all_atom_mask"
]
=
all_atom_mask
pdb_feats
[
"all_atom_mask"
]
=
all_atom_mask
...
@@ -128,6 +154,22 @@ def make_pdb_features(
...
@@ -128,6 +154,22 @@ def make_pdb_features(
return
pdb_feats
return
pdb_feats
def
make_pdb_features
(
protein_object
:
protein
.
Protein
,
description
:
str
,
confidence_threshold
:
float
=
0.5
,
)
->
FeatureDict
:
pdb_feats
=
make_protein_features
(
protein_object
,
description
)
high_confidence
=
protein_object
.
b_factors
>
confidence_threshold
high_confidence
=
np
.
any
(
high_confidence
,
axis
=-
1
)
for
i
,
confident
in
enumerate
(
high_confidence
):
if
(
not
confident
):
pdb_feats
[
"all_atom_mask"
][
i
]
=
0
return
pdb_feats
def
make_msa_features
(
def
make_msa_features
(
msas
:
Sequence
[
Sequence
[
str
]],
msas
:
Sequence
[
Sequence
[
str
]],
deletion_matrices
:
Sequence
[
parsers
.
DeletionMatrix
],
deletion_matrices
:
Sequence
[
parsers
.
DeletionMatrix
],
...
@@ -349,22 +391,11 @@ class DataPipeline:
...
@@ -349,22 +391,11 @@ class DataPipeline:
num_res
=
len
(
input_sequence
)
num_res
=
len
(
input_sequence
)
hits
=
self
.
_parse_template_hits
(
alignment_dir
)
hits
=
self
.
_parse_template_hits
(
alignment_dir
)
hits_cat
=
sum
(
hits
.
values
(),
[])
template_features
=
make_template_features
(
if
(
len
(
hits_cat
)
==
0
):
input_sequence
,
template_features
=
empty_template_feats
(
len
(
input_sequence
))
hits
,
else
:
self
.
template_featurizer
,
templates_result
=
self
.
template_featurizer
.
get_templates
(
)
query_sequence
=
input_sequence
,
query_pdb_code
=
None
,
query_release_date
=
None
,
hits
=
hits_cat
,
)
template_features
=
templates_result
.
features
# The template featurizer doesn't format empty template features
# properly. This is a quick fix.
if
(
template_features
[
"template_aatype"
].
shape
[
0
]
==
0
):
template_features
=
empty_template_feats
(
len
(
input_sequence
))
sequence_features
=
make_sequence_features
(
sequence_features
=
make_sequence_features
(
sequence
=
input_sequence
,
sequence
=
input_sequence
,
...
@@ -403,23 +434,13 @@ class DataPipeline:
...
@@ -403,23 +434,13 @@ class DataPipeline:
input_sequence
=
mmcif
.
chain_to_seqres
[
chain_id
]
input_sequence
=
mmcif
.
chain_to_seqres
[
chain_id
]
hits
=
self
.
_parse_template_hits
(
alignment_dir
)
hits
=
self
.
_parse_template_hits
(
alignment_dir
)
hits_cat
=
sum
(
hits
.
values
(),
[])
template_features
=
make_template_features
(
if
(
len
(
hits_cat
)
==
0
):
input_sequence
,
template_features
=
empty_template_feats
(
len
(
input_sequence
))
hits
,
else
:
self
.
template_featurizer
,
templates_result
=
self
.
template_featurizer
.
get_templates
(
query_release_date
=
to_date
(
mmcif
.
header
[
"release_date"
])
query_sequence
=
input_sequence
,
)
query_pdb_code
=
None
,
query_release_date
=
to_date
(
mmcif
.
header
[
"release_date"
]),
hits
=
hits_cat
,
)
template_features
=
templates_result
.
features
# The template featurizer doesn't format empty template features
# properly. This is a quick fix.
if
(
template_features
[
"template_aatype"
].
shape
[
0
]
==
0
):
template_features
=
empty_template_feats
(
len
(
input_sequence
))
msa_features
=
self
.
_process_msa_feats
(
alignment_dir
)
msa_features
=
self
.
_process_msa_feats
(
alignment_dir
)
return
{
**
mmcif_feats
,
**
template_features
,
**
msa_features
}
return
{
**
mmcif_feats
,
**
template_features
,
**
msa_features
}
...
@@ -433,31 +454,48 @@ class DataPipeline:
...
@@ -433,31 +454,48 @@ class DataPipeline:
Assembles features for a protein in a PDB file.
Assembles features for a protein in a PDB file.
"""
"""
with
open
(
pdb_path
,
'r'
)
as
f
:
with
open
(
pdb_path
,
'r'
)
as
f
:
pdb_str
=
pdb_path
pdb_str
=
f
.
read
()
protein_object
=
protein
.
from_pdb_string
(
pdb_str
)
protein_object
=
protein
.
from_pdb_string
(
pdb_str
)
input_sequence
=
protein_object
.
aatype
input_sequence
=
_aatype_to_str_sequence
(
protein_object
.
aatype
)
description
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
pdb_path
))[
0
].
upper
()
pdb_feats
=
make_pdb_features
(
protein_object
)
pdb_feats
=
make_pdb_features
(
protein_object
,
description
)
hits
=
self
.
_parse_template_hits
(
alignment_dir
)
hits
=
self
.
_parse_template_hits
(
alignment_dir
)
hits_cat
=
sum
(
hits
.
values
(),
[])
template_features
=
make_template_features
(
if
(
len
(
hits_cat
)
==
0
):
input_sequence
,
template_features
=
empty_template_feats
(
len
(
input_sequence
))
hits
,
else
:
self
.
template_featurizer
,
templates_result
=
self
.
template_featurizer
.
get_templates
(
)
query_sequence
=
input_sequence
,
query_pdb_code
=
None
,
query_release_date
=
None
,
hits
=
hits_cat
,
)
template_features
=
templates_result
.
features
# The template featurizer doesn't format empty template features
# properly. This is a quick fix.
if
(
template_features
[
"template_aatype"
].
shape
[
0
]
==
0
):
template_features
=
empty_template_feats
(
len
(
input_sequence
))
msa_features
=
self
.
_process_msa_feats
(
alignment_dir
)
msa_features
=
self
.
_process_msa_feats
(
alignment_dir
)
return
{
**
pdb_feats
,
**
template_features
,
**
msa_features
}
return
{
**
pdb_feats
,
**
template_features
,
**
msa_features
}
def
process_core
(
self
,
core_path
:
str
,
alignment_dir
:
str
,
)
->
FeatureDict
:
"""
Assembles features for a protein in a ProteinNet .core file.
"""
with
open
(
core_path
,
'r'
)
as
f
:
core_str
=
f
.
read
()
protein_object
=
protein
.
from_proteinnet_string
(
core_str
)
input_sequence
=
_aatype_to_str_sequence
(
protein_object
.
aatype
)
description
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
core_path
))[
0
].
upper
()
core_feats
=
make_protein_features
(
protein_object
,
description
)
hits
=
self
.
_parse_template_hits
(
alignment_dir
)
template_features
=
make_template_features
(
input_sequence
,
hits
,
self
.
template_featurizer
,
)
msa_features
=
self
.
_process_msa_feats
(
alignment_dir
)
return
{
**
core_feats
,
**
template_features
,
**
msa_features
}
openfold/np/protein.py
View file @
22cc45a8
...
@@ -17,13 +17,16 @@
...
@@ -17,13 +17,16 @@
import
dataclasses
import
dataclasses
import
io
import
io
from
typing
import
Any
,
Mapping
,
Optional
from
typing
import
Any
,
Mapping
,
Optional
import
re
from
openfold.np
import
residue_constants
from
openfold.np
import
residue_constants
from
Bio.PDB
import
PDBParser
from
Bio.PDB
import
PDBParser
import
numpy
as
np
import
numpy
as
np
FeatureDict
=
Mapping
[
str
,
np
.
ndarray
]
FeatureDict
=
Mapping
[
str
,
np
.
ndarray
]
ModelOutput
=
Mapping
[
str
,
Any
]
# Is a nested dict.
ModelOutput
=
Mapping
[
str
,
Any
]
# Is a nested dict.
PICO_TO_ANGSTROM
=
0.01
@
dataclasses
.
dataclass
(
frozen
=
True
)
@
dataclasses
.
dataclass
(
frozen
=
True
)
class
Protein
:
class
Protein
:
...
@@ -132,6 +135,59 @@ def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
...
@@ -132,6 +135,59 @@ def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
)
)
def
from_proteinnet_string
(
proteinnet_str
:
str
)
->
Protein
:
tag_re
=
r
'(\[[A-Z]+\]\n)'
tags
=
[
tag
.
strip
()
for
tag
in
re
.
split
(
tag_re
,
proteinnet_str
)
if
len
(
tag
)
>
0
]
groups
=
zip
(
tags
[
0
::
2
],
[
l
.
split
(
'
\n
'
)
for
l
in
tags
[
1
::
2
]])
atoms
=
[
'N'
,
'CA'
,
'C'
]
aatype
=
None
atom_positions
=
None
atom_mask
=
None
for
g
in
groups
:
if
(
"[PRIMARY]"
==
g
[
0
]):
seq
=
g
[
1
][
0
].
strip
()
for
i
in
range
(
len
(
seq
)):
if
(
seq
[
i
]
not
in
residue_constants
.
restypes
):
seq
[
i
]
=
'X'
aatype
=
np
.
array
([
residue_constants
.
restype_order
.
get
(
res_symbol
,
residue_constants
.
restype_num
)
for
res_symbol
in
seq
])
elif
(
"[TERTIARY]"
==
g
[
0
]):
tertiary
=
[]
for
axis
in
range
(
3
):
tertiary
.
append
(
list
(
map
(
float
,
g
[
1
][
axis
].
split
())))
tertiary_np
=
np
.
array
(
tertiary
)
atom_positions
=
np
.
zeros
(
(
len
(
tertiary
[
0
])
//
3
,
residue_constants
.
atom_type_num
,
3
)
).
astype
(
np
.
float32
)
for
i
,
atom
in
enumerate
(
atoms
):
atom_positions
[:,
residue_constants
.
atom_order
[
atom
],
:]
=
(
np
.
transpose
(
tertiary_np
[:,
i
::
3
])
)
atom_positions
*=
PICO_TO_ANGSTROM
elif
(
"[MASK]"
==
g
[
0
]):
mask
=
np
.
array
(
list
(
map
({
'-'
:
0
,
'+'
:
1
}.
get
,
g
[
1
][
0
].
strip
())))
atom_mask
=
np
.
zeros
(
(
len
(
mask
),
residue_constants
.
atom_type_num
,)
).
astype
(
np
.
float32
)
for
i
,
atom
in
enumerate
(
atoms
):
atom_mask
[:,
residue_constants
.
atom_order
[
atom
]]
=
1
atom_mask
*=
mask
[...,
None
]
return
Protein
(
atom_positions
=
atom_positions
,
atom_mask
=
atom_mask
,
aatype
=
aatype
,
residue_index
=
np
.
arange
(
len
(
aatype
)),
b_factors
=
None
,
)
def
to_pdb
(
prot
:
Protein
)
->
str
:
def
to_pdb
(
prot
:
Protein
)
->
str
:
"""Converts a `Protein` instance to a PDB string.
"""Converts a `Protein` instance to a PDB string.
...
...
scripts/unpack_proteinnet.py
0 → 100644
View file @
22cc45a8
import
argparse
import
os
from
pathlib
import
Path
def
_write_file
(
args
,
file_in_progress
):
file_id
=
file_in_progress
[
1
]
fname
=
file_id
.
upper
()
+
".core"
fpath
=
os
.
path
.
join
(
args
.
output_dir
,
fname
)
with
open
(
fpath
,
"w"
)
as
fp
:
fp
.
write
(
'
\n
'
.
join
(
file_in_progress
))
def
main
(
args
):
Path
(
args
.
output_dir
).
mkdir
(
parents
=
True
,
exist_ok
=
True
)
with
open
(
args
.
proteinnet_file
,
"r"
)
as
fp
:
proteinnet_string
=
fp
.
readlines
()
file_in_progress
=
[]
for
line
in
proteinnet_string
:
if
(
line
==
"[ID]
\n
"
):
if
(
len
(
file_in_progress
)
>
0
):
_write_file
(
args
,
file_in_progress
)
file_in_progress
=
[]
file_in_progress
.
append
(
line
.
strip
())
if
(
len
(
file_in_progress
)
>
0
):
_write_file
(
args
,
file_in_progress
)
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"proteinnet_file"
,
type
=
str
,
help
=
"Path to ProteinNet file to unpack"
)
parser
.
add_argument
(
"output_dir"
,
type
=
str
,
help
=
"Path to directory in which to output .core files"
)
args
=
parser
.
parse_args
()
main
(
args
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment