Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
OpenFold
Commits
63e0a247
"launch/git@developer.sourcefind.cn:OpenDAS/dynamo.git" did not exist on "3d2928510c77fc6cb29c497c91a493e3d06c0cc1"
Commit
63e0a247
authored
Oct 08, 2021
by
Sachin Kadyan
Browse files
Added parser for parsing mmCIF files for generating templates.
parent
2654fcce
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
371 additions
and
0 deletions
+371
-0
openfold/features/mmcif_parsing.py
openfold/features/mmcif_parsing.py
+371
-0
No files found.
openfold/features/mmcif_parsing.py
0 → 100644
View file @
63e0a247
"""Parses the mmCIF file format."""
import
collections
import
dataclasses
import
io
from
typing
import
Any
,
Mapping
,
Optional
,
Sequence
,
Tuple
from
absl
import
logging
from
Bio
import
PDB
from
Bio.Data
import
SCOPData
# Type aliases:
ChainId
=
str
PdbHeader
=
Mapping
[
str
,
Any
]
PdbStructure
=
PDB
.
Structure
.
Structure
SeqRes
=
str
MmCIFDict
=
Mapping
[
str
,
Sequence
[
str
]]
@
dataclasses
.
dataclass
(
frozen
=
True
)
class
Monomer
:
id
:
str
num
:
int
# Note - mmCIF format provides no guarantees on the type of author-assigned
# sequence numbers. They need not be integers.
@
dataclasses
.
dataclass
(
frozen
=
True
)
class
AtomSite
:
residue_name
:
str
author_chain_id
:
str
mmcif_chain_id
:
str
author_seq_num
:
str
mmcif_seq_num
:
int
insertion_code
:
str
hetatm_atom
:
str
model_num
:
int
# Used to map SEQRES index to a residue in the structure.
@
dataclasses
.
dataclass
(
frozen
=
True
)
class
ResiduePosition
:
chain_id
:
str
residue_number
:
int
insertion_code
:
str
@
dataclasses
.
dataclass
(
frozen
=
True
)
class
ResidueAtPosition
:
position
:
Optional
[
ResiduePosition
]
name
:
str
is_missing
:
bool
hetflag
:
str
@
dataclasses
.
dataclass
(
frozen
=
True
)
class
MmcifObject
:
"""Representation of a parsed mmCIF file.
Contains:
file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
files being processed.
header: Biopython header.
structure: Biopython structure.
chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g.
{'A': 'ABCDEFG'}
seqres_to_structure: Dict; for each chain_id contains a mapping between
SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition,
1: ResidueAtPosition,
...}}
raw_string: The raw string used to construct the MmcifObject.
"""
file_id
:
str
header
:
PdbHeader
structure
:
PdbStructure
chain_to_seqres
:
Mapping
[
ChainId
,
SeqRes
]
seqres_to_structure
:
Mapping
[
ChainId
,
Mapping
[
int
,
ResidueAtPosition
]]
raw_string
:
Any
@
dataclasses
.
dataclass
(
frozen
=
True
)
class
ParsingResult
:
"""Returned by the parse function.
Contains:
mmcif_object: A MmcifObject, may be None if no chain could be successfully
parsed.
errors: A dict mapping (file_id, chain_id) to any exception generated.
"""
mmcif_object
:
Optional
[
MmcifObject
]
errors
:
Mapping
[
Tuple
[
str
,
str
],
Any
]
class
ParseError
(
Exception
):
"""An error indicating that an mmCIF file could not be parsed."""
def
mmcif_loop_to_list
(
prefix
:
str
,
parsed_info
:
MmCIFDict
)
->
Sequence
[
Mapping
[
str
,
str
]]:
"""Extracts loop associated with a prefix from mmCIF data as a list.
Reference for loop_ in mmCIF:
http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html
Args:
prefix: Prefix shared by each of the data items in the loop.
e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
_entity_poly_seq.mon_id. Should include the trailing period.
parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
parser.
Returns:
Returns a list of dicts; each dict represents 1 entry from an mmCIF loop.
"""
cols
=
[]
data
=
[]
for
key
,
value
in
parsed_info
.
items
():
if
key
.
startswith
(
prefix
):
cols
.
append
(
key
)
data
.
append
(
value
)
assert
all
([
len
(
xs
)
==
len
(
data
[
0
])
for
xs
in
data
]),
(
'mmCIF error: Not all loops are the same length: %s'
%
cols
)
return
[
dict
(
zip
(
cols
,
xs
))
for
xs
in
zip
(
*
data
)]
def
mmcif_loop_to_dict
(
prefix
:
str
,
index
:
str
,
parsed_info
:
MmCIFDict
,
)
->
Mapping
[
str
,
Mapping
[
str
,
str
]]:
"""Extracts loop associated with a prefix from mmCIF data as a dictionary.
Args:
prefix: Prefix shared by each of the data items in the loop.
e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
_entity_poly_seq.mon_id. Should include the trailing period.
index: Which item of loop data should serve as the key.
parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
parser.
Returns:
Returns a dict of dicts; each dict represents 1 entry from an mmCIF loop,
indexed by the index column.
"""
entries
=
mmcif_loop_to_list
(
prefix
,
parsed_info
)
return
{
entry
[
index
]:
entry
for
entry
in
entries
}
def
parse
(
*
,
file_id
:
str
,
mmcif_string
:
str
,
catch_all_errors
:
bool
=
True
)
->
ParsingResult
:
"""Entry point, parses an mmcif_string.
Args:
file_id: A string identifier for this file. Should be unique within the
collection of files being processed.
mmcif_string: Contents of an mmCIF file.
catch_all_errors: If True, all exceptions are caught and error messages are
returned as part of the ParsingResult. If False exceptions will be allowed
to propagate.
Returns:
A ParsingResult.
"""
errors
=
{}
try
:
parser
=
PDB
.
MMCIFParser
(
QUIET
=
True
)
handle
=
io
.
StringIO
(
mmcif_string
)
full_structure
=
parser
.
get_structure
(
''
,
handle
)
first_model_structure
=
_get_first_model
(
full_structure
)
# Extract the _mmcif_dict from the parser, which contains useful fields not
# reflected in the Biopython structure.
parsed_info
=
parser
.
_mmcif_dict
# pylint:disable=protected-access
# Ensure all values are lists, even if singletons.
for
key
,
value
in
parsed_info
.
items
():
if
not
isinstance
(
value
,
list
):
parsed_info
[
key
]
=
[
value
]
header
=
_get_header
(
parsed_info
)
# Determine the protein chains, and their start numbers according to the
# internal mmCIF numbering scheme (likely but not guaranteed to be 1).
valid_chains
=
_get_protein_chains
(
parsed_info
=
parsed_info
)
if
not
valid_chains
:
return
ParsingResult
(
None
,
{(
file_id
,
''
):
'No protein chains found in this file.'
})
seq_start_num
=
{
chain_id
:
min
([
monomer
.
num
for
monomer
in
seq
])
for
chain_id
,
seq
in
valid_chains
.
items
()}
# Loop over the atoms for which we have coordinates. Populate two mappings:
# -mmcif_to_author_chain_id (maps internal mmCIF chain ids to chain ids used
# the authors / Biopython).
# -seq_to_structure_mappings (maps idx into sequence to ResidueAtPosition).
mmcif_to_author_chain_id
=
{}
seq_to_structure_mappings
=
{}
for
atom
in
_get_atom_site_list
(
parsed_info
):
if
atom
.
model_num
!=
'1'
:
# We only process the first model at the moment.
continue
mmcif_to_author_chain_id
[
atom
.
mmcif_chain_id
]
=
atom
.
author_chain_id
if
atom
.
mmcif_chain_id
in
valid_chains
:
hetflag
=
' '
if
atom
.
hetatm_atom
==
'HETATM'
:
# Water atoms are assigned a special hetflag of W in Biopython. We
# need to do the same, so that this hetflag can be used to fetch
# a residue from the Biopython structure by id.
if
atom
.
residue_name
in
(
'HOH'
,
'WAT'
):
hetflag
=
'W'
else
:
hetflag
=
'H_'
+
atom
.
residue_name
insertion_code
=
atom
.
insertion_code
if
not
_is_set
(
atom
.
insertion_code
):
insertion_code
=
' '
position
=
ResiduePosition
(
chain_id
=
atom
.
author_chain_id
,
residue_number
=
int
(
atom
.
author_seq_num
),
insertion_code
=
insertion_code
)
seq_idx
=
int
(
atom
.
mmcif_seq_num
)
-
seq_start_num
[
atom
.
mmcif_chain_id
]
current
=
seq_to_structure_mappings
.
get
(
atom
.
author_chain_id
,
{})
current
[
seq_idx
]
=
ResidueAtPosition
(
position
=
position
,
name
=
atom
.
residue_name
,
is_missing
=
False
,
hetflag
=
hetflag
)
seq_to_structure_mappings
[
atom
.
author_chain_id
]
=
current
# Add missing residue information to seq_to_structure_mappings.
for
chain_id
,
seq_info
in
valid_chains
.
items
():
author_chain
=
mmcif_to_author_chain_id
[
chain_id
]
current_mapping
=
seq_to_structure_mappings
[
author_chain
]
for
idx
,
monomer
in
enumerate
(
seq_info
):
if
idx
not
in
current_mapping
:
current_mapping
[
idx
]
=
ResidueAtPosition
(
position
=
None
,
name
=
monomer
.
id
,
is_missing
=
True
,
hetflag
=
' '
)
author_chain_to_sequence
=
{}
for
chain_id
,
seq_info
in
valid_chains
.
items
():
author_chain
=
mmcif_to_author_chain_id
[
chain_id
]
seq
=
[]
for
monomer
in
seq_info
:
code
=
SCOPData
.
protein_letters_3to1
.
get
(
monomer
.
id
,
'X'
)
seq
.
append
(
code
if
len
(
code
)
==
1
else
'X'
)
seq
=
''
.
join
(
seq
)
author_chain_to_sequence
[
author_chain
]
=
seq
mmcif_object
=
MmcifObject
(
file_id
=
file_id
,
header
=
header
,
structure
=
first_model_structure
,
chain_to_seqres
=
author_chain_to_sequence
,
seqres_to_structure
=
seq_to_structure_mappings
,
raw_string
=
parsed_info
)
return
ParsingResult
(
mmcif_object
=
mmcif_object
,
errors
=
errors
)
except
Exception
as
e
:
# pylint:disable=broad-except
errors
[(
file_id
,
''
)]
=
e
if
not
catch_all_errors
:
raise
return
ParsingResult
(
mmcif_object
=
None
,
errors
=
errors
)
def
_get_first_model
(
structure
:
PdbStructure
)
->
PdbStructure
:
"""Returns the first model in a Biopython structure."""
return
next
(
structure
.
get_models
())
_MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDE
=
21
def
get_release_date
(
parsed_info
:
MmCIFDict
)
->
str
:
"""Returns the oldest revision date."""
revision_dates
=
parsed_info
[
'_pdbx_audit_revision_history.revision_date'
]
return
min
(
revision_dates
)
def
_get_header
(
parsed_info
:
MmCIFDict
)
->
PdbHeader
:
"""Returns a basic header containing method, release date and resolution."""
header
=
{}
experiments
=
mmcif_loop_to_list
(
'_exptl.'
,
parsed_info
)
header
[
'structure_method'
]
=
','
.
join
([
experiment
[
'_exptl.method'
].
lower
()
for
experiment
in
experiments
])
# Note: The release_date here corresponds to the oldest revision. We prefer to
# use this for dataset filtering over the deposition_date.
if
'_pdbx_audit_revision_history.revision_date'
in
parsed_info
:
header
[
'release_date'
]
=
get_release_date
(
parsed_info
)
else
:
logging
.
warning
(
'Could not determine release_date: %s'
,
parsed_info
[
'_entry.id'
])
header
[
'resolution'
]
=
0.00
for
res_key
in
(
'_refine.ls_d_res_high'
,
'_em_3d_reconstruction.resolution'
,
'_reflns.d_resolution_high'
):
if
res_key
in
parsed_info
:
try
:
raw_resolution
=
parsed_info
[
res_key
][
0
]
header
[
'resolution'
]
=
float
(
raw_resolution
)
except
ValueError
:
logging
.
warning
(
'Invalid resolution format: %s'
,
parsed_info
[
res_key
])
return
header
def
_get_atom_site_list
(
parsed_info
:
MmCIFDict
)
->
Sequence
[
AtomSite
]:
"""Returns list of atom sites; contains data not present in the structure."""
return
[
AtomSite
(
*
site
)
for
site
in
zip
(
# pylint:disable=g-complex-comprehension
parsed_info
[
'_atom_site.label_comp_id'
],
parsed_info
[
'_atom_site.auth_asym_id'
],
parsed_info
[
'_atom_site.label_asym_id'
],
parsed_info
[
'_atom_site.auth_seq_id'
],
parsed_info
[
'_atom_site.label_seq_id'
],
parsed_info
[
'_atom_site.pdbx_PDB_ins_code'
],
parsed_info
[
'_atom_site.group_PDB'
],
parsed_info
[
'_atom_site.pdbx_PDB_model_num'
],
)]
def
_get_protein_chains
(
*
,
parsed_info
:
Mapping
[
str
,
Any
])
->
Mapping
[
ChainId
,
Sequence
[
Monomer
]]:
"""Extracts polymer information for protein chains only.
Args:
parsed_info: _mmcif_dict produced by the Biopython parser.
Returns:
A dict mapping mmcif chain id to a list of Monomers.
"""
# Get polymer information for each entity in the structure.
entity_poly_seqs
=
mmcif_loop_to_list
(
'_entity_poly_seq.'
,
parsed_info
)
polymers
=
collections
.
defaultdict
(
list
)
for
entity_poly_seq
in
entity_poly_seqs
:
polymers
[
entity_poly_seq
[
'_entity_poly_seq.entity_id'
]].
append
(
Monomer
(
id
=
entity_poly_seq
[
'_entity_poly_seq.mon_id'
],
num
=
int
(
entity_poly_seq
[
'_entity_poly_seq.num'
])))
# Get chemical compositions. Will allow us to identify which of these polymers
# are proteins.
chem_comps
=
mmcif_loop_to_dict
(
'_chem_comp.'
,
'_chem_comp.id'
,
parsed_info
)
# Get chains information for each entity. Necessary so that we can return a
# dict keyed on chain id rather than entity.
struct_asyms
=
mmcif_loop_to_list
(
'_struct_asym.'
,
parsed_info
)
entity_to_mmcif_chains
=
collections
.
defaultdict
(
list
)
for
struct_asym
in
struct_asyms
:
chain_id
=
struct_asym
[
'_struct_asym.id'
]
entity_id
=
struct_asym
[
'_struct_asym.entity_id'
]
entity_to_mmcif_chains
[
entity_id
].
append
(
chain_id
)
# Identify and return the valid protein chains.
valid_chains
=
{}
for
entity_id
,
seq_info
in
polymers
.
items
():
chain_ids
=
entity_to_mmcif_chains
[
entity_id
]
# Reject polymers without any peptide-like components, such as DNA/RNA.
if
any
([
'peptide'
in
chem_comps
[
monomer
.
id
][
'_chem_comp.type'
]
for
monomer
in
seq_info
]):
for
chain_id
in
chain_ids
:
valid_chains
[
chain_id
]
=
seq_info
return
valid_chains
def
_is_set
(
data
:
str
)
->
bool
:
"""Returns False if data is a special mmCIF character indicating 'unset'."""
return
data
not
in
(
'.'
,
'?'
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment