Commit 22cc45a8 authored by Gustaf's avatar Gustaf
Browse files

Begin to add ProteinNet support (WIP)

parent ed9ca30d
...@@ -37,6 +37,33 @@ def empty_template_feats(n_res) -> FeatureDict: ...@@ -37,6 +37,33 @@ def empty_template_feats(n_res) -> FeatureDict:
} }
def make_template_features(
input_sequence: str,
hits: Sequence[Any],
template_featurizer: Any,
query_pdb_code: Optional[str] = None,
query_release_date: Optional[str] = None,
) -> FeatureDict:
hits_cat = sum(hits.values(), [])
if(len(hits_cat) == 0):
template_features = empty_template_feats(len(input_sequence))
else:
templates_result = template_featurizer.get_templates(
query_sequence=input_sequence,
query_pdb_code=query_pdb_code,
query_release_date=query_release_date,
hits=hits_cat,
)
template_features = templates_result.features
# The template featurizer doesn't format empty template features
# properly. This is a quick fix.
if(template_features["template_aatype"].shape[0] == 0):
template_features = empty_template_feats(len(input_sequence))
return template_features
def make_sequence_features( def make_sequence_features(
sequence: str, description: str, num_res: int sequence: str, description: str, num_res: int
) -> FeatureDict: ) -> FeatureDict:
...@@ -95,16 +122,21 @@ def make_mmcif_features( ...@@ -95,16 +122,21 @@ def make_mmcif_features(
return mmcif_feats return mmcif_feats
def make_pdb_features( def _aatype_to_str_sequence(aatype):
return str([
residue_constants.restypes[aatype[i]] for i in range(len(aatype))
])
def make_protein_features(
protein_object: protein.Protein, protein_object: protein.Protein,
description: str, description: str,
confidence_threshold: float = 0.5,
) -> FeatureDict: ) -> FeatureDict:
pdb_feats = {} pdb_feats = {}
aatype = protein_object.aatype
sequence = _aatype_to_str_sequence(aatype)
pdb_feats.update( pdb_feats.update(
make_sequence_features( make_sequence_features(
sequence=protein_object.aatype, sequence=sequence,
description=description, description=description,
num_res=len(protein_object.aatype), num_res=len(protein_object.aatype),
) )
...@@ -113,12 +145,6 @@ def make_pdb_features( ...@@ -113,12 +145,6 @@ def make_pdb_features(
all_atom_positions = protein_object.atom_positions all_atom_positions = protein_object.atom_positions
all_atom_mask = protein_object.atom_mask all_atom_mask = protein_object.atom_mask
high_confidence = protein.b_factors > confidence_threshold
high_confidence = np.any(high_confidence, axis=-1)
for i, confident in enumerate(high_confidence):
if(not confident):
all_atom_mask[i] = 0
pdb_feats["all_atom_positions"] = all_atom_positions pdb_feats["all_atom_positions"] = all_atom_positions
pdb_feats["all_atom_mask"] = all_atom_mask pdb_feats["all_atom_mask"] = all_atom_mask
...@@ -128,6 +154,22 @@ def make_pdb_features( ...@@ -128,6 +154,22 @@ def make_pdb_features(
return pdb_feats return pdb_feats
def make_pdb_features(
protein_object: protein.Protein,
description: str,
confidence_threshold: float = 0.5,
) -> FeatureDict:
pdb_feats = make_protein_features(protein_object, description)
high_confidence = protein_object.b_factors > confidence_threshold
high_confidence = np.any(high_confidence, axis=-1)
for i, confident in enumerate(high_confidence):
if(not confident):
pdb_feats["all_atom_mask"][i] = 0
return pdb_feats
def make_msa_features( def make_msa_features(
msas: Sequence[Sequence[str]], msas: Sequence[Sequence[str]],
deletion_matrices: Sequence[parsers.DeletionMatrix], deletion_matrices: Sequence[parsers.DeletionMatrix],
...@@ -349,22 +391,11 @@ class DataPipeline: ...@@ -349,22 +391,11 @@ class DataPipeline:
num_res = len(input_sequence) num_res = len(input_sequence)
hits = self._parse_template_hits(alignment_dir) hits = self._parse_template_hits(alignment_dir)
hits_cat = sum(hits.values(), []) template_features = make_template_features(
if(len(hits_cat) == 0): input_sequence,
template_features = empty_template_feats(len(input_sequence)) hits,
else: self.template_featurizer,
templates_result = self.template_featurizer.get_templates(
query_sequence=input_sequence,
query_pdb_code=None,
query_release_date=None,
hits=hits_cat,
) )
template_features = templates_result.features
# The template featurizer doesn't format empty template features
# properly. This is a quick fix.
if(template_features["template_aatype"].shape[0] == 0):
template_features = empty_template_feats(len(input_sequence))
sequence_features = make_sequence_features( sequence_features = make_sequence_features(
sequence=input_sequence, sequence=input_sequence,
...@@ -403,22 +434,12 @@ class DataPipeline: ...@@ -403,22 +434,12 @@ class DataPipeline:
input_sequence = mmcif.chain_to_seqres[chain_id] input_sequence = mmcif.chain_to_seqres[chain_id]
hits = self._parse_template_hits(alignment_dir) hits = self._parse_template_hits(alignment_dir)
hits_cat = sum(hits.values(), []) template_features = make_template_features(
if(len(hits_cat) == 0): input_sequence,
template_features = empty_template_feats(len(input_sequence)) hits,
else: self.template_featurizer,
templates_result = self.template_featurizer.get_templates( query_release_date=to_date(mmcif.header["release_date"])
query_sequence=input_sequence,
query_pdb_code=None,
query_release_date=to_date(mmcif.header["release_date"]),
hits=hits_cat,
) )
template_features = templates_result.features
# The template featurizer doesn't format empty template features
# properly. This is a quick fix.
if(template_features["template_aatype"].shape[0] == 0):
template_features = empty_template_feats(len(input_sequence))
msa_features = self._process_msa_feats(alignment_dir) msa_features = self._process_msa_feats(alignment_dir)
...@@ -433,31 +454,48 @@ class DataPipeline: ...@@ -433,31 +454,48 @@ class DataPipeline:
Assembles features for a protein in a PDB file. Assembles features for a protein in a PDB file.
""" """
with open(pdb_path, 'r') as f: with open(pdb_path, 'r') as f:
pdb_str = pdb_path pdb_str = f.read()
protein_object = protein.from_pdb_string(pdb_str) protein_object = protein.from_pdb_string(pdb_str)
input_sequence = protein_object.aatype input_sequence = _aatype_to_str_sequence(protein_object.aatype)
description = os.path.splitext(os.path.basename(pdb_path))[0].upper()
pdb_feats = make_pdb_features(protein_object) pdb_feats = make_pdb_features(protein_object, description)
hits = self._parse_template_hits(alignment_dir) hits = self._parse_template_hits(alignment_dir)
hits_cat = sum(hits.values(), []) template_features = make_template_features(
if(len(hits_cat) == 0): input_sequence,
template_features = empty_template_feats(len(input_sequence)) hits,
else: self.template_featurizer,
templates_result = self.template_featurizer.get_templates(
query_sequence=input_sequence,
query_pdb_code=None,
query_release_date=None,
hits=hits_cat,
) )
template_features = templates_result.features
# The template featurizer doesn't format empty template features
# properly. This is a quick fix.
if(template_features["template_aatype"].shape[0] == 0):
template_features = empty_template_feats(len(input_sequence))
msa_features = self._process_msa_feats(alignment_dir) msa_features = self._process_msa_feats(alignment_dir)
return {**pdb_feats, **template_features, **msa_features} return {**pdb_feats, **template_features, **msa_features}
def process_core(
self,
core_path: str,
alignment_dir: str,
) -> FeatureDict:
"""
Assembles features for a protein in a ProteinNet .core file.
"""
with open(core_path, 'r') as f:
core_str = f.read()
protein_object = protein.from_proteinnet_string(core_str)
input_sequence = _aatype_to_str_sequence(protein_object.aatype)
description = os.path.splitext(os.path.basename(core_path))[0].upper()
core_feats = make_protein_features(protein_object, description)
hits = self._parse_template_hits(alignment_dir)
template_features = make_template_features(
input_sequence,
hits,
self.template_featurizer,
)
msa_features = self._process_msa_feats(alignment_dir)
return {**core_feats, **template_features, **msa_features}
...@@ -17,13 +17,16 @@ ...@@ -17,13 +17,16 @@
import dataclasses import dataclasses
import io import io
from typing import Any, Mapping, Optional from typing import Any, Mapping, Optional
import re
from openfold.np import residue_constants from openfold.np import residue_constants
from Bio.PDB import PDBParser from Bio.PDB import PDBParser
import numpy as np import numpy as np
FeatureDict = Mapping[str, np.ndarray] FeatureDict = Mapping[str, np.ndarray]
ModelOutput = Mapping[str, Any] # Is a nested dict. ModelOutput = Mapping[str, Any] # Is a nested dict.
PICO_TO_ANGSTROM = 0.01
@dataclasses.dataclass(frozen=True) @dataclasses.dataclass(frozen=True)
class Protein: class Protein:
...@@ -132,6 +135,59 @@ def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein: ...@@ -132,6 +135,59 @@ def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
) )
def from_proteinnet_string(proteinnet_str: str) -> Protein:
tag_re = r'(\[[A-Z]+\]\n)'
tags = [
tag.strip() for tag in re.split(tag_re, proteinnet_str) if len(tag) > 0
]
groups = zip(tags[0::2], [l.split('\n') for l in tags[1::2]])
atoms = ['N', 'CA', 'C']
aatype = None
atom_positions = None
atom_mask = None
for g in groups:
if("[PRIMARY]" == g[0]):
seq = g[1][0].strip()
for i in range(len(seq)):
if(seq[i] not in residue_constants.restypes):
seq[i] = 'X'
aatype = np.array([
residue_constants.restype_order.get(
res_symbol, residue_constants.restype_num
) for res_symbol in seq
])
elif("[TERTIARY]" == g[0]):
tertiary = []
for axis in range(3):
tertiary.append(list(map(float, g[1][axis].split())))
tertiary_np = np.array(tertiary)
atom_positions = np.zeros(
(len(tertiary[0])//3, residue_constants.atom_type_num, 3)
).astype(np.float32)
for i, atom in enumerate(atoms):
atom_positions[:, residue_constants.atom_order[atom], :] = (
np.transpose(tertiary_np[:, i::3])
)
atom_positions *= PICO_TO_ANGSTROM
elif("[MASK]" == g[0]):
mask = np.array(list(map({'-': 0, '+': 1}.get, g[1][0].strip())))
atom_mask = np.zeros(
(len(mask), residue_constants.atom_type_num,)
).astype(np.float32)
for i, atom in enumerate(atoms):
atom_mask[:, residue_constants.atom_order[atom]] = 1
atom_mask *= mask[..., None]
return Protein(
atom_positions=atom_positions,
atom_mask=atom_mask,
aatype=aatype,
residue_index=np.arange(len(aatype)),
b_factors=None,
)
def to_pdb(prot: Protein) -> str: def to_pdb(prot: Protein) -> str:
"""Converts a `Protein` instance to a PDB string. """Converts a `Protein` instance to a PDB string.
......
import argparse
import os
from pathlib import Path
def _write_file(args, file_in_progress):
file_id = file_in_progress[1]
fname = file_id.upper() + ".core"
fpath = os.path.join(args.output_dir, fname)
with open(fpath, "w") as fp:
fp.write('\n'.join(file_in_progress))
def main(args):
Path(args.output_dir).mkdir(parents=True, exist_ok=True)
with open(args.proteinnet_file, "r") as fp:
proteinnet_string = fp.readlines()
file_in_progress = []
for line in proteinnet_string:
if(line == "[ID]\n"):
if(len(file_in_progress) > 0):
_write_file(args, file_in_progress)
file_in_progress = []
file_in_progress.append(line.strip())
if(len(file_in_progress) > 0):
_write_file(args, file_in_progress)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"proteinnet_file", type=str,
help="Path to ProteinNet file to unpack"
)
parser.add_argument(
"output_dir", type=str,
help="Path to directory in which to output .core files"
)
args = parser.parse_args()
main(args)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment