protein.py 7.54 KB
Newer Older
Augustin-Zidek's avatar
Augustin-Zidek committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Protein data type."""
Tom Ward's avatar
Tom Ward committed
16
import dataclasses
Augustin-Zidek's avatar
Augustin-Zidek committed
17
18
import io
from typing import Any, Mapping, Optional
Tom Ward's avatar
Tom Ward committed
19
from alphafold.common import residue_constants
Augustin-Zidek's avatar
Augustin-Zidek committed
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from Bio.PDB import PDBParser
import numpy as np

FeatureDict = Mapping[str, np.ndarray]
ModelOutput = Mapping[str, Any]  # Is a nested dict.


@dataclasses.dataclass(frozen=True)
class Protein:
  """Protein structure representation."""

  # Cartesian coordinates of atoms in angstroms. The atom types correspond to
  # residue_constants.atom_types, i.e. the first three are N, CA, CB.
  atom_positions: np.ndarray  # [num_res, num_atom_type, 3]

  # Amino-acid type for each residue represented as an integer between 0 and
  # 20, where 20 is 'X'.
  aatype: np.ndarray  # [num_res]

  # Binary float mask to indicate presence of a particular atom. 1.0 if an atom
  # is present and 0.0 if not. This should be used for loss masking.
  atom_mask: np.ndarray  # [num_res, num_atom_type]

  # Residue index as used in PDB. It is not necessarily continuous or 0-indexed.
  residue_index: np.ndarray  # [num_res]

  # B-factors, or temperature factors, of each residue (in sq. angstroms units),
  # representing the displacement of the residue from its ground truth mean
  # value.
  b_factors: np.ndarray  # [num_res, num_atom_type]


def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
  """Takes a PDB string and constructs a Protein object.

  WARNING: All non-standard residue types will be converted into UNK. All
    non-standard atoms will be ignored.

  Args:
    pdb_str: The contents of the pdb file
    chain_id: If None, then the pdb file must contain a single chain (which
      will be parsed). If chain_id is specified (e.g. A), then only that chain
      is parsed.

  Returns:
    A new `Protein` parsed from the pdb contents.
  """
  pdb_fh = io.StringIO(pdb_str)
68
  parser = PDBParser(QUIET=True)
Augustin-Zidek's avatar
Augustin-Zidek committed
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
  structure = parser.get_structure('none', pdb_fh)
  models = list(structure.get_models())
  if len(models) != 1:
    raise ValueError(
        f'Only single model PDBs are supported. Found {len(models)} models.')
  model = models[0]

  if chain_id is not None:
    chain = model[chain_id]
  else:
    chains = list(model.get_chains())
    if len(chains) != 1:
      raise ValueError(
          'Only single chain PDBs are supported when chain_id not specified. '
          f'Found {len(chains)} chains.')
    else:
      chain = chains[0]

  atom_positions = []
  aatype = []
  atom_mask = []
  residue_index = []
  b_factors = []

  for res in chain:
    if res.id[2] != ' ':
      raise ValueError(
          f'PDB contains an insertion code at chain {chain.id} and residue '
          f'index {res.id[1]}. These are not supported.')
    res_shortname = residue_constants.restype_3to1.get(res.resname, 'X')
    restype_idx = residue_constants.restype_order.get(
        res_shortname, residue_constants.restype_num)
    pos = np.zeros((residue_constants.atom_type_num, 3))
    mask = np.zeros((residue_constants.atom_type_num,))
    res_b_factors = np.zeros((residue_constants.atom_type_num,))
    for atom in res:
      if atom.name not in residue_constants.atom_types:
        continue
      pos[residue_constants.atom_order[atom.name]] = atom.coord
      mask[residue_constants.atom_order[atom.name]] = 1.
      res_b_factors[residue_constants.atom_order[atom.name]] = atom.bfactor
    if np.sum(mask) < 0.5:
      # If no known atom positions are reported for the residue then skip it.
      continue
    aatype.append(restype_idx)
    atom_positions.append(pos)
    atom_mask.append(mask)
    residue_index.append(res.id[1])
    b_factors.append(res_b_factors)

  return Protein(
      atom_positions=np.array(atom_positions),
      atom_mask=np.array(atom_mask),
      aatype=np.array(aatype),
      residue_index=np.array(residue_index),
      b_factors=np.array(b_factors))


def to_pdb(prot: Protein) -> str:
  """Converts a `Protein` instance to a PDB string.

  Args:
    prot: The protein to convert to PDB.

  Returns:
    PDB string.
  """
  restypes = residue_constants.restypes + ['X']
  res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], 'UNK')
  atom_types = residue_constants.atom_types

  pdb_lines = []

  atom_mask = prot.atom_mask
  aatype = prot.aatype
  atom_positions = prot.atom_positions
  residue_index = prot.residue_index.astype(np.int32)
  b_factors = prot.b_factors

  if np.any(aatype > residue_constants.restype_num):
    raise ValueError('Invalid aatypes.')

  pdb_lines.append('MODEL     1')
  atom_index = 1
  chain_id = 'A'
  # Add all atom sites.
  for i in range(aatype.shape[0]):
    res_name_3 = res_1to3(aatype[i])
    for atom_name, pos, mask, b_factor in zip(
        atom_types, atom_positions[i], atom_mask[i], b_factors[i]):
      if mask < 0.5:
        continue

      record_type = 'ATOM'
      name = atom_name if len(atom_name) == 4 else f' {atom_name}'
      alt_loc = ''
      insertion_code = ''
      occupancy = 1.00
      element = atom_name[0]  # Protein supports only C, N, O, S, this works.
      charge = ''
      # PDB is a columnar format, every space matters here!
      atom_line = (f'{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}'
                   f'{res_name_3:>3} {chain_id:>1}'
                   f'{residue_index[i]:>4}{insertion_code:>1}   '
                   f'{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}'
                   f'{occupancy:>6.2f}{b_factor:>6.2f}          '
                   f'{element:>2}{charge:>2}')
      pdb_lines.append(atom_line)
      atom_index += 1

  # Close the chain.
  chain_end = 'TER'
  chain_termination_line = (
      f'{chain_end:<6}{atom_index:>5}      {res_1to3(aatype[-1]):>3} '
      f'{chain_id:>1}{residue_index[-1]:>4}')
  pdb_lines.append(chain_termination_line)
  pdb_lines.append('ENDMDL')

  pdb_lines.append('END')
  pdb_lines.append('')
  return '\n'.join(pdb_lines)


def ideal_atom_mask(prot: Protein) -> np.ndarray:
  """Computes an ideal atom mask.

  `Protein.atom_mask` typically is defined according to the atoms that are
  reported in the PDB. This function computes a mask according to heavy atoms
  that should be present in the given seqence of amino acids.

  Args:
    prot: `Protein` whose fields are `numpy.ndarray` objects.

  Returns:
    An ideal atom mask.
  """
  return residue_constants.STANDARD_ATOM_MASK[prot.aatype]


208
209
def from_prediction(features: FeatureDict, result: ModelOutput,
                    b_factors: Optional[np.ndarray] = None) -> Protein:
Augustin-Zidek's avatar
Augustin-Zidek committed
210
211
212
213
214
  """Assembles a protein from a prediction.

  Args:
    features: Dictionary holding model inputs.
    result: Dictionary holding model outputs.
215
    b_factors: (Optional) B-factors to use for the protein.
Augustin-Zidek's avatar
Augustin-Zidek committed
216
217
218
219
220

  Returns:
    A protein instance.
  """
  fold_output = result['structure_module']
221
222
  if b_factors is None:
    b_factors = np.zeros_like(fold_output['final_atom_mask'])
Augustin-Zidek's avatar
Augustin-Zidek committed
223
224
225
226
227
228

  return Protein(
      aatype=features['aatype'][0],
      atom_positions=fold_output['final_atom_positions'],
      atom_mask=fold_output['final_atom_mask'],
      residue_index=features['residue_index'][0] + 1,
229
      b_factors=b_factors)