pdbfile.py 18.5 KB
Newer Older
1
2
"""
pdbfile.py: Used for loading PDB files.
3
4
5
6
7
8

This is part of the OpenMM molecular simulation toolkit originating from
Simbios, the NIH National Center for Physics-Based Simulation of
Biological Structures at Stanford, funded under the NIH Roadmap for
Medical Research, grant U54 GM072970. See https://simtk.org.

9
Portions copyright (c) 2012-2015 Stanford University and the Authors.
10
11
12
Authors: Peter Eastman
Contributors:

Justin MacCallum's avatar
Justin MacCallum committed
13
Permission is hereby granted, free of charge, to any person obtaining a
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
USE OR OTHER DEALINGS IN THE SOFTWARE.
30
"""
31
from __future__ import print_function, division, absolute_import
32
33
34
35
36
__author__ = "Peter Eastman"
__version__ = "1.0"

import os
import sys
37
import math
38
39
import xml.etree.ElementTree as etree
from copy import copy
40
41
from datetime import date
from simtk.openmm import Vec3, Platform
42
from simtk.openmm.app.internal.pdbstructure import PdbStructure
43
from simtk.openmm.app.internal.unitcell import computeLengthsAndAngles
44
from simtk.openmm.app import Topology
45
from simtk.unit import nanometers, angstroms, is_quantity, norm, Quantity, dot
46
from . import element as elem
47
48
try:
    import numpy
49
except ImportError:
50
51
    pass

52
53
class PDBFile(object):
    """PDBFile parses a Protein Data Bank (PDB) file and constructs a Topology and a set of atom positions from it.
Justin MacCallum's avatar
Justin MacCallum committed
54

55
56
57
    This class also provides methods for creating PDB files.  To write a file containing a single model, call
    writeFile().  You also can create files that contain multiple models.  To do this, first call writeHeader(),
    then writeModel() once for each model in the file, and finally writeFooter() to complete the file."""
Justin MacCallum's avatar
Justin MacCallum committed
58

59
60
    _residueNameReplacements = {}
    _atomNameReplacements = {}
Justin MacCallum's avatar
Justin MacCallum committed
61

62
    def __init__(self, file, extraParticleIdentifier='EP'):
63
        """Load a PDB file.
Justin MacCallum's avatar
Justin MacCallum committed
64

65
        The atom positions and Topology can be retrieved by calling getPositions() and getTopology().
Justin MacCallum's avatar
Justin MacCallum committed
66

67
68
69
70
        Parameters
        ----------
        file : string
            the name of the file to load
71
72
        """
        top = Topology()
73
        ## The Topology read from the PDB file
74
        self.topology = top
Justin MacCallum's avatar
Justin MacCallum committed
75

76
        # Load the PDB file
Justin MacCallum's avatar
Justin MacCallum committed
77

peastman's avatar
peastman committed
78
79
80
81
        if isinstance(file, PdbStructure):
            pdb = file
        else:
            inputfile = file
82
            own_handle = False
peastman's avatar
peastman committed
83
84
            if isinstance(file, str):
                inputfile = open(file)
85
                own_handle = True
86
            pdb = PdbStructure(inputfile, load_all_models=True, extraParticleIdentifier=extraParticleIdentifier)
87
88
            if own_handle:
                inputfile.close()
89
90
91
92
        PDBFile._loadNameReplacementTables()

        # Build the topology

93
        atomByNumber = {}
94
        for chain in pdb.iter_chains():
95
            c = top.addChain(chain.chain_id)
96
97
98
99
            for residue in chain.iter_residues():
                resName = residue.get_name()
                if resName in PDBFile._residueNameReplacements:
                    resName = PDBFile._residueNameReplacements[resName]
100
                r = top.addResidue(resName, c, str(residue.number))
101
102
103
104
                if resName in PDBFile._atomNameReplacements:
                    atomReplacements = PDBFile._atomNameReplacements[resName]
                else:
                    atomReplacements = {}
105
                for atom in residue.atoms:
106
107
108
109
                    atomName = atom.get_name()
                    if atomName in atomReplacements:
                        atomName = atomReplacements[atomName]
                    atomName = atomName.strip()
110
                    element = atom.element
111
112
113
                    if element == extraParticleIdentifier:
                        element = None
                    elif element is None:
114
                        # Try to guess the element.
Justin MacCallum's avatar
Justin MacCallum committed
115

116
117
118
119
120
121
122
123
124
125
126
127
128
                        upper = atomName.upper()
                        if upper.startswith('CL'):
                            element = elem.chlorine
                        elif upper.startswith('NA'):
                            element = elem.sodium
                        elif upper.startswith('MG'):
                            element = elem.magnesium
                        elif upper.startswith('BE'):
                            element = elem.beryllium
                        elif upper.startswith('LI'):
                            element = elem.lithium
                        elif upper.startswith('K'):
                            element = elem.potassium
Carlos Hernández's avatar
Carlos Hernández committed
129
130
                        elif upper.startswith('ZN'):
                            element = elem.zinc
131
132
133
134
135
136
                        elif( len( residue ) == 1 and upper.startswith('CA') ):
                            element = elem.calcium
                        else:
                            try:
                                element = elem.get_by_symbol(atomName[0])
                            except KeyError:
137
                                pass
138
                    newAtom = top.addAtom(atomName, element, r, str(atom.serial_number))
139
                    atomByNumber[atom.serial_number] = newAtom
140
141
142
        self._positions = []
        for model in pdb.iter_models(True):
            coords = []
143
144
145
146
147
            for chain in model.iter_chains():
                for residue in chain.iter_residues():
                    for atom in residue.atoms:
                        pos = atom.get_position().value_in_unit(nanometers)
                        coords.append(Vec3(pos[0], pos[1], pos[2]))
148
149
150
            self._positions.append(coords*nanometers)
        ## The atom positions read from the PDB file.  If the file contains multiple frames, these are the positions in the first frame.
        self.positions = self._positions[0]
151
        self.topology.setPeriodicBoxVectors(pdb.get_periodic_box_vectors())
152
153
        self.topology.createStandardBonds()
        self.topology.createDisulfideBonds(self.positions)
154
        self._numpyPositions = None
Justin MacCallum's avatar
Justin MacCallum committed
155

156
        # Add bonds based on CONECT records.
Justin MacCallum's avatar
Justin MacCallum committed
157

158
        connectBonds = []
159
        for connect in pdb.models[-1].connects:
160
161
            i = connect[0]
            for j in connect[1:]:
peastman's avatar
peastman committed
162
163
                if i in atomByNumber and j in atomByNumber:
                    connectBonds.append((atomByNumber[i], atomByNumber[j]))
164
165
166
167
168
169
170
        if len(connectBonds) > 0:
            # Only add bonds that don't already exist.
            existingBonds = set(top.bonds())
            for bond in connectBonds:
                if bond not in existingBonds and (bond[1], bond[0]) not in existingBonds:
                    top.addBond(bond[0], bond[1])
                    existingBonds.add(bond)
171
172
173
174

    def getTopology(self):
        """Get the Topology of the model."""
        return self.topology
Justin MacCallum's avatar
Justin MacCallum committed
175

176
177
178
    def getNumFrames(self):
        """Get the number of frames stored in the file."""
        return len(self._positions)
Justin MacCallum's avatar
Justin MacCallum committed
179

180
    def getPositions(self, asNumpy=False, frame=0):
181
        """Get the atomic positions.
Justin MacCallum's avatar
Justin MacCallum committed
182

Robert McGibbon's avatar
Robert McGibbon committed
183
184
185
186
187
188
189
190
        Parameters
        ----------
        asNumpy : boolean=False
            if true, the values are returned as a numpy array instead of a list
            of Vec3s
        frame : int=0
            the index of the frame for which to get positions
        """
191
192
        if asNumpy:
            if self._numpyPositions is None:
193
194
195
196
197
                self._numpyPositions = [None]*len(self._positions)
            if self._numpyPositions[frame] is None:
                self._numpyPositions[frame] = Quantity(numpy.array(self._positions[frame].value_in_unit(nanometers)), nanometers)
            return self._numpyPositions[frame]
        return self._positions[frame]
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239

    @staticmethod
    def _loadNameReplacementTables():
        """Load the list of atom and residue name replacements."""
        if len(PDBFile._residueNameReplacements) == 0:
            tree = etree.parse(os.path.join(os.path.dirname(__file__), 'data', 'pdbNames.xml'))
            allResidues = {}
            proteinResidues = {}
            nucleicAcidResidues = {}
            for residue in tree.getroot().findall('Residue'):
                name = residue.attrib['name']
                if name == 'All':
                    PDBFile._parseResidueAtoms(residue, allResidues)
                elif name == 'Protein':
                    PDBFile._parseResidueAtoms(residue, proteinResidues)
                elif name == 'Nucleic':
                    PDBFile._parseResidueAtoms(residue, nucleicAcidResidues)
            for atom in allResidues:
                proteinResidues[atom] = allResidues[atom]
                nucleicAcidResidues[atom] = allResidues[atom]
            for residue in tree.getroot().findall('Residue'):
                name = residue.attrib['name']
                for id in residue.attrib:
                    if id == 'name' or id.startswith('alt'):
                        PDBFile._residueNameReplacements[residue.attrib[id]] = name
                if 'type' not in residue.attrib:
                    atoms = copy(allResidues)
                elif residue.attrib['type'] == 'Protein':
                    atoms = copy(proteinResidues)
                elif residue.attrib['type'] == 'Nucleic':
                    atoms = copy(nucleicAcidResidues)
                else:
                    atoms = copy(allResidues)
                PDBFile._parseResidueAtoms(residue, atoms)
                PDBFile._atomNameReplacements[name] = atoms

    @staticmethod
    def _parseResidueAtoms(residue, map):
        for atom in residue.findall('Atom'):
            name = atom.attrib['name']
            for id in atom.attrib:
                map[atom.attrib[id]] = name
Justin MacCallum's avatar
Justin MacCallum committed
240

241
    @staticmethod
242
    def writeFile(topology, positions, file=sys.stdout, keepIds=False, extraParticleIdentifier='EP'):
243
        """Write a PDB file containing a single model.
Justin MacCallum's avatar
Justin MacCallum committed
244

Robert McGibbon's avatar
Robert McGibbon committed
245
246
247
248
249
250
251
252
253
254
255
256
257
        Parameters
        ----------
        topology : Topology
            The Topology defining the model to write
        positions : list
            The list of atomic positions to write
        file : file=stdout
            A file to write to
        keepIds : bool=False
            If True, keep the residue and chain IDs specified in the Topology
            rather than generating new ones.  Warning: It is up to the caller to
            make sure these are valid IDs that satisfy the requirements of the
            PDB format.  Otherwise, the output file will be invalid.
258
259
        """
        PDBFile.writeHeader(topology, file)
260
        PDBFile.writeModel(topology, positions, file, keepIds=keepIds, extraParticleIdentifier=extraParticleIdentifier)
261
262
263
264
265
        PDBFile.writeFooter(topology, file)

    @staticmethod
    def writeHeader(topology, file=sys.stdout):
        """Write out the header for a PDB file.
Justin MacCallum's avatar
Justin MacCallum committed
266

Robert McGibbon's avatar
Robert McGibbon committed
267
268
269
270
271
272
        Parameters
        ----------
        topology : Topology
            The Topology defining the molecular system being written
        file : file=stdout
            A file to write the file to
273
        """
274
        print("REMARK   1 CREATED WITH OPENMM %s, %s" % (Platform.getOpenMMVersion(), str(date.today())), file=file)
275
276
        vectors = topology.getPeriodicBoxVectors()
        if vectors is not None:
277
278
279
280
            a, b, c, alpha, beta, gamma = computeLengthsAndAngles(vectors)
            RAD_TO_DEG = 180/math.pi
            print("CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f P 1           1 " % (
                    a*10, b*10, c*10, alpha*RAD_TO_DEG, beta*RAD_TO_DEG, gamma*RAD_TO_DEG), file=file)
Justin MacCallum's avatar
Justin MacCallum committed
281

282
    @staticmethod
283
    def writeModel(topology, positions, file=sys.stdout, modelIndex=None, keepIds=False, extraParticleIdentifier='EP'):
284
        """Write out a model to a PDB file.
Justin MacCallum's avatar
Justin MacCallum committed
285

Robert McGibbon's avatar
Robert McGibbon committed
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
        Parameters
        ----------
        topology : Topology
            The Topology defining the model to write
        positions : list
            The list of atomic positions to write
        file : file=stdout
            A file to write the model to
        modelIndex : int=None
            If not None, the model will be surrounded by MODEL/ENDMDL records
            with this index
        keepIds : bool=False
            If True, keep the residue and chain IDs specified in the Topology
            rather than generating new ones.  Warning: It is up to the caller to
            make sure these are valid IDs that satisfy the requirements of the
            PDB format.  Otherwise, the output file will be invalid.
302
        """
Robert McGibbon's avatar
Robert McGibbon committed
303

304
        if len(list(topology.atoms())) != len(positions):
Justin MacCallum's avatar
Justin MacCallum committed
305
            raise ValueError('The number of positions must match the number of atoms')
306
307
        if is_quantity(positions):
            positions = positions.value_in_unit(angstroms)
308
309
310
311
        if any(math.isnan(norm(pos)) for pos in positions):
            raise ValueError('Particle position is NaN')
        if any(math.isinf(norm(pos)) for pos in positions):
            raise ValueError('Particle position is infinite')
312
313
314
        atomIndex = 1
        posIndex = 0
        if modelIndex is not None:
315
            print("MODEL     %4d" % modelIndex, file=file)
316
        for (chainIndex, chain) in enumerate(topology.chains()):
317
318
319
320
            if keepIds:
                chainName = chain.id
            else:
                chainName = chr(ord('A')+chainIndex%26)
321
322
323
324
325
326
            residues = list(chain.residues())
            for (resIndex, res) in enumerate(residues):
                if len(res.name) > 3:
                    resName = res.name[:3]
                else:
                    resName = res.name
327
328
329
330
                if keepIds:
                    resId = res.id
                else:
                    resId = "%4d" % ((resIndex+1)%10000)
331
                for atom in res.atoms():
332
333
334
335
336
337
338
                    if atom.element is not None:
                        symbol = atom.element.symbol
                    elif atom.element is None and extraParticleIdentifier is not None:
                        symbol = extraParticleIdentifier
                    else:
                        symbol = ' '
                    if len(atom.name) < 4 and atom.name[:1].isalpha() and len(symbol) < 2:
339
340
341
342
343
344
                        atomName = ' '+atom.name
                    elif len(atom.name) > 4:
                        atomName = atom.name[:4]
                    else:
                        atomName = atom.name
                    coords = positions[posIndex]
345
346
                    line = "ATOM  %5d %-4s %3s %s%4s    %s%s%s  1.00  0.00          %2s  " % (
                        atomIndex%100000, atomName, resName, chainName, resId, _format_83(coords[0]),
Robert McGibbon's avatar
Robert McGibbon committed
347
                        _format_83(coords[1]), _format_83(coords[2]), symbol)
348
                    assert len(line) == 80, 'Fixed width overflow detected'
349
                    print(line, file=file)
350
351
352
                    posIndex += 1
                    atomIndex += 1
                if resIndex == len(residues)-1:
353
                    print("TER   %5d      %3s %s%4s" % (atomIndex, resName, chainName, resId), file=file)
354
355
                    atomIndex += 1
        if modelIndex is not None:
356
            print("ENDMDL", file=file)
357
358
359
360

    @staticmethod
    def writeFooter(topology, file=sys.stdout):
        """Write out the footer for a PDB file.
Justin MacCallum's avatar
Justin MacCallum committed
361

Robert McGibbon's avatar
Robert McGibbon committed
362
363
364
365
366
367
        Parameters
        ----------
        topology : Topology
            The Topology defining the molecular system being written
        file : file=stdout
            A file to write the file to
368
        """
peastman's avatar
peastman committed
369
        # Identify bonds that should be listed as CONECT records.
Robert McGibbon's avatar
Robert McGibbon committed
370

peastman's avatar
peastman committed
371
372
373
374
375
376
377
378
379
380
        standardResidues = ['ALA', 'ASN', 'CYS', 'GLU', 'HIS', 'LEU', 'MET', 'PRO', 'THR', 'TYR',
                            'ARG', 'ASP', 'GLN', 'GLY', 'ILE', 'LYS', 'PHE', 'SER', 'TRP', 'VAL',
                            'A', 'G', 'C', 'U', 'I', 'DA', 'DG', 'DC', 'DT', 'DI', 'HOH']
        conectBonds = []
        for atom1, atom2 in topology.bonds():
            if atom1.residue.name not in standardResidues or atom2.residue.name not in standardResidues:
                conectBonds.append((atom1, atom2))
            elif atom1.name == 'SG' and atom2.name == 'SG' and atom1.residue.name == 'CYS' and atom2.residue.name == 'CYS':
                conectBonds.append((atom1, atom2))
        if len(conectBonds) > 0:
Robert McGibbon's avatar
Robert McGibbon committed
381

peastman's avatar
peastman committed
382
            # Work out the index used in the PDB file for each atom.
Robert McGibbon's avatar
Robert McGibbon committed
383

peastman's avatar
peastman committed
384
385
386
387
388
389
390
391
392
393
            atomIndex = {}
            nextAtomIndex = 0
            prevChain = None
            for chain in topology.chains():
                for atom in chain.atoms():
                    if atom.residue.chain != prevChain:
                        nextAtomIndex += 1
                        prevChain = atom.residue.chain
                    atomIndex[atom] = nextAtomIndex
                    nextAtomIndex += 1
Robert McGibbon's avatar
Robert McGibbon committed
394

peastman's avatar
peastman committed
395
            # Record which other atoms each atom is bonded to.
Robert McGibbon's avatar
Robert McGibbon committed
396

peastman's avatar
peastman committed
397
398
399
400
401
402
403
404
405
406
            atomBonds = {}
            for atom1, atom2 in conectBonds:
                index1 = atomIndex[atom1]
                index2 = atomIndex[atom2]
                if index1 not in atomBonds:
                    atomBonds[index1] = []
                if index2 not in atomBonds:
                    atomBonds[index2] = []
                atomBonds[index1].append(index2)
                atomBonds[index2].append(index1)
Robert McGibbon's avatar
Robert McGibbon committed
407

peastman's avatar
peastman committed
408
            # Write the CONECT records.
Robert McGibbon's avatar
Robert McGibbon committed
409

peastman's avatar
peastman committed
410
411
412
            for index1 in sorted(atomBonds):
                bonded = atomBonds[index1]
                while len(bonded) > 4:
413
                    print("CONECT%5d%5d%5d%5d" % (index1, bonded[0], bonded[1], bonded[2]), file=file)
peastman's avatar
peastman committed
414
415
416
417
                    del bonded[:4]
                line = "CONECT%5d" % index1
                for index2 in bonded:
                    line = "%s%5d" % (line, index2)
418
419
                print(line, file=file)
        print("END", file=file)
420

421
422
423
424
425

def _format_83(f):
    """Format a single float into a string of width 8, with ideally 3 decimal
    places of precision. If the number is a little too large, we can
    gracefully degrade the precision by lopping off some of the decimal
Robert McGibbon's avatar
Robert McGibbon committed
426
    places. If it's much too large, we throw a ValueError"""
427
428
429
430
    if -999.999 < f < 9999.999:
        return '%8.3f' % f
    if -9999999 < f < 99999999:
        return ('%8.3f' % f)[:8]
431
    raise ValueError('coordinate "%s" could not be represented '
Robert McGibbon's avatar
Robert McGibbon committed
432
                     'in a width-8 field' % f)