pdbfile.py 20.9 KB
Newer Older
1
2
"""
pdbfile.py: Used for loading PDB files.
3
4
5
6
7
8

This is part of the OpenMM molecular simulation toolkit originating from
Simbios, the NIH National Center for Physics-Based Simulation of
Biological Structures at Stanford, funded under the NIH Roadmap for
Medical Research, grant U54 GM072970. See https://simtk.org.

9
Portions copyright (c) 2012-2020 Stanford University and the Authors.
10
11
12
Authors: Peter Eastman
Contributors:

Justin MacCallum's avatar
Justin MacCallum committed
13
Permission is hereby granted, free of charge, to any person obtaining a
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
USE OR OTHER DEALINGS IN THE SOFTWARE.
30
"""
31
from __future__ import print_function, division, absolute_import
32
33
34
35
36
__author__ = "Peter Eastman"
__version__ = "1.0"

import os
import sys
37
import math
38
39
import xml.etree.ElementTree as etree
from copy import copy
40
41
from datetime import date
from simtk.openmm import Vec3, Platform
42
from simtk.openmm.app.internal.pdbstructure import PdbStructure
43
from simtk.openmm.app.internal.unitcell import computeLengthsAndAngles
44
from simtk.openmm.app import Topology
45
from simtk.unit import nanometers, angstroms, is_quantity, norm, Quantity, dot
46
from . import element as elem
47
48
try:
    import numpy
49
except ImportError:
50
51
    pass

52
53
class PDBFile(object):
    """PDBFile parses a Protein Data Bank (PDB) file and constructs a Topology and a set of atom positions from it.
Justin MacCallum's avatar
Justin MacCallum committed
54

55
56
57
    This class also provides methods for creating PDB files.  To write a file containing a single model, call
    writeFile().  You also can create files that contain multiple models.  To do this, first call writeHeader(),
    then writeModel() once for each model in the file, and finally writeFooter() to complete the file."""
Justin MacCallum's avatar
Justin MacCallum committed
58

59
60
    _residueNameReplacements = {}
    _atomNameReplacements = {}
61
62
63
    _standardResidues = ['ALA', 'ASN', 'CYS', 'GLU', 'HIS', 'LEU', 'MET', 'PRO', 'THR', 'TYR',
                         'ARG', 'ASP', 'GLN', 'GLY', 'ILE', 'LYS', 'PHE', 'SER', 'TRP', 'VAL',
                         'A', 'G', 'C', 'U', 'I', 'DA', 'DG', 'DC', 'DT', 'DI', 'HOH']
Justin MacCallum's avatar
Justin MacCallum committed
64

65
    def __init__(self, file, extraParticleIdentifier='EP'):
66
        """Load a PDB file.
Justin MacCallum's avatar
Justin MacCallum committed
67

68
        The atom positions and Topology can be retrieved by calling getPositions() and getTopology().
Justin MacCallum's avatar
Justin MacCallum committed
69

70
71
        Parameters
        ----------
72
73
        file : string or file
            the name of the file to load.  Alternatively you can pass an open file object.
Rafal P. Wiewiora's avatar
Rafal P. Wiewiora committed
74
75
        extraParticleIdentifier : string='EP'
            if this value appears in the element column for an ATOM record, the Atom's element will be set to None to mark it as an extra particle
76
        """
77
78
        
        metalElements = ['Al','As','Ba','Ca','Cd','Ce','Co','Cs','Cu','Dy','Fe','Gd','Hg','Ho','In','Ir','K','Li','Mg',
Rafal P. Wiewiora's avatar
Rafal P. Wiewiora committed
79
        'Mn','Mo','Na','Ni','Pb','Pd','Pt','Rb','Rh','Sm','Sr','Te','Tl','V','W','Yb','Zn']
80
        
81
        top = Topology()
82
        ## The Topology read from the PDB file
83
        self.topology = top
Justin MacCallum's avatar
Justin MacCallum committed
84

85
        # Load the PDB file
Justin MacCallum's avatar
Justin MacCallum committed
86

peastman's avatar
peastman committed
87
88
89
90
        if isinstance(file, PdbStructure):
            pdb = file
        else:
            inputfile = file
91
            own_handle = False
peastman's avatar
peastman committed
92
93
            if isinstance(file, str):
                inputfile = open(file)
94
                own_handle = True
95
            pdb = PdbStructure(inputfile, load_all_models=True, extraParticleIdentifier=extraParticleIdentifier)
96
97
            if own_handle:
                inputfile.close()
98
99
100
101
        PDBFile._loadNameReplacementTables()

        # Build the topology

102
        atomByNumber = {}
103
        for chain in pdb.iter_chains():
104
            c = top.addChain(chain.chain_id)
105
106
107
108
            for residue in chain.iter_residues():
                resName = residue.get_name()
                if resName in PDBFile._residueNameReplacements:
                    resName = PDBFile._residueNameReplacements[resName]
peastman's avatar
peastman committed
109
                r = top.addResidue(resName, c, str(residue.number), residue.insertion_code)
110
111
112
113
                if resName in PDBFile._atomNameReplacements:
                    atomReplacements = PDBFile._atomNameReplacements[resName]
                else:
                    atomReplacements = {}
114
                for atom in residue.iter_atoms():
115
116
117
118
                    atomName = atom.get_name()
                    if atomName in atomReplacements:
                        atomName = atomReplacements[atomName]
                    atomName = atomName.strip()
119
                    element = atom.element
120
                    if element == 'EP':
121
122
                        element = None
                    elif element is None:
123
                        # Try to guess the element.
Justin MacCallum's avatar
Justin MacCallum committed
124

125
                        upper = atomName.upper()
peastman's avatar
peastman committed
126
                        while len(upper) > 1 and upper[0].isdigit():
127
                            upper = upper[1:]
128
129
130
131
132
133
134
135
136
137
138
139
                        if upper.startswith('CL'):
                            element = elem.chlorine
                        elif upper.startswith('NA'):
                            element = elem.sodium
                        elif upper.startswith('MG'):
                            element = elem.magnesium
                        elif upper.startswith('BE'):
                            element = elem.beryllium
                        elif upper.startswith('LI'):
                            element = elem.lithium
                        elif upper.startswith('K'):
                            element = elem.potassium
Carlos Hernández's avatar
Carlos Hernández committed
140
141
                        elif upper.startswith('ZN'):
                            element = elem.zinc
142
                        elif len(residue) == 1 and upper.startswith('CA'):
143
                            element = elem.calcium
144
145
                        elif upper.startswith('D') and any(a.name == atomName[1:] for a in residue.iter_atoms()):
                            pass # A Drude particle
146
147
                        else:
                            try:
148
                                element = elem.get_by_symbol(upper[0])
149
150
                            except KeyError:
                                pass
151
                    newAtom = top.addAtom(atomName, element, r, str(atom.serial_number))
152
                    atomByNumber[atom.serial_number] = newAtom
153
154
155
        self._positions = []
        for model in pdb.iter_models(True):
            coords = []
156
157
            for chain in model.iter_chains():
                for residue in chain.iter_residues():
158
                    for atom in residue.iter_atoms():
159
160
                        pos = atom.get_position().value_in_unit(nanometers)
                        coords.append(Vec3(pos[0], pos[1], pos[2]))
161
162
163
            self._positions.append(coords*nanometers)
        ## The atom positions read from the PDB file.  If the file contains multiple frames, these are the positions in the first frame.
        self.positions = self._positions[0]
164
        self.topology.setPeriodicBoxVectors(pdb.get_periodic_box_vectors())
165
166
        self.topology.createStandardBonds()
        self.topology.createDisulfideBonds(self.positions)
167
        self._numpyPositions = None
Justin MacCallum's avatar
Justin MacCallum committed
168

Rafal P. Wiewiora's avatar
Rafal P. Wiewiora committed
169
        # Add bonds based on CONECT records. Bonds between metals of elements specified in metalElements and residues in standardResidues are not added.
Justin MacCallum's avatar
Justin MacCallum committed
170

171
        connectBonds = []
172
        for connect in pdb.models[-1].connects:
173
174
            i = connect[0]
            for j in connect[1:]:
175
                if i in atomByNumber and j in atomByNumber:    
Rafal P. Wiewiora's avatar
Rafal P. Wiewiora committed
176
                    if atomByNumber[i].element is not None and atomByNumber[j].element is not None:
177
178
                        if atomByNumber[i].element.symbol not in metalElements and atomByNumber[j].element.symbol not in metalElements:
                            connectBonds.append((atomByNumber[i], atomByNumber[j])) 
179
                        elif atomByNumber[i].element.symbol in metalElements and atomByNumber[j].residue.name not in PDBFile._standardResidues:
180
                            connectBonds.append((atomByNumber[i], atomByNumber[j])) 
181
                        elif atomByNumber[j].element.symbol in metalElements and atomByNumber[i].residue.name not in PDBFile._standardResidues:
182
                            connectBonds.append((atomByNumber[i], atomByNumber[j]))     
183
184
                    else:
                        connectBonds.append((atomByNumber[i], atomByNumber[j]))         
185
186
187
188
189
190
191
        if len(connectBonds) > 0:
            # Only add bonds that don't already exist.
            existingBonds = set(top.bonds())
            for bond in connectBonds:
                if bond not in existingBonds and (bond[1], bond[0]) not in existingBonds:
                    top.addBond(bond[0], bond[1])
                    existingBonds.add(bond)
192
193
194
195

    def getTopology(self):
        """Get the Topology of the model."""
        return self.topology
Justin MacCallum's avatar
Justin MacCallum committed
196

197
198
199
    def getNumFrames(self):
        """Get the number of frames stored in the file."""
        return len(self._positions)
Justin MacCallum's avatar
Justin MacCallum committed
200

201
    def getPositions(self, asNumpy=False, frame=0):
202
        """Get the atomic positions.
Justin MacCallum's avatar
Justin MacCallum committed
203

Robert McGibbon's avatar
Robert McGibbon committed
204
205
206
207
208
209
210
211
        Parameters
        ----------
        asNumpy : boolean=False
            if true, the values are returned as a numpy array instead of a list
            of Vec3s
        frame : int=0
            the index of the frame for which to get positions
        """
212
213
        if asNumpy:
            if self._numpyPositions is None:
214
215
216
217
218
                self._numpyPositions = [None]*len(self._positions)
            if self._numpyPositions[frame] is None:
                self._numpyPositions[frame] = Quantity(numpy.array(self._positions[frame].value_in_unit(nanometers)), nanometers)
            return self._numpyPositions[frame]
        return self._positions[frame]
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260

    @staticmethod
    def _loadNameReplacementTables():
        """Load the list of atom and residue name replacements."""
        if len(PDBFile._residueNameReplacements) == 0:
            tree = etree.parse(os.path.join(os.path.dirname(__file__), 'data', 'pdbNames.xml'))
            allResidues = {}
            proteinResidues = {}
            nucleicAcidResidues = {}
            for residue in tree.getroot().findall('Residue'):
                name = residue.attrib['name']
                if name == 'All':
                    PDBFile._parseResidueAtoms(residue, allResidues)
                elif name == 'Protein':
                    PDBFile._parseResidueAtoms(residue, proteinResidues)
                elif name == 'Nucleic':
                    PDBFile._parseResidueAtoms(residue, nucleicAcidResidues)
            for atom in allResidues:
                proteinResidues[atom] = allResidues[atom]
                nucleicAcidResidues[atom] = allResidues[atom]
            for residue in tree.getroot().findall('Residue'):
                name = residue.attrib['name']
                for id in residue.attrib:
                    if id == 'name' or id.startswith('alt'):
                        PDBFile._residueNameReplacements[residue.attrib[id]] = name
                if 'type' not in residue.attrib:
                    atoms = copy(allResidues)
                elif residue.attrib['type'] == 'Protein':
                    atoms = copy(proteinResidues)
                elif residue.attrib['type'] == 'Nucleic':
                    atoms = copy(nucleicAcidResidues)
                else:
                    atoms = copy(allResidues)
                PDBFile._parseResidueAtoms(residue, atoms)
                PDBFile._atomNameReplacements[name] = atoms

    @staticmethod
    def _parseResidueAtoms(residue, map):
        for atom in residue.findall('Atom'):
            name = atom.attrib['name']
            for id in atom.attrib:
                map[atom.attrib[id]] = name
Justin MacCallum's avatar
Justin MacCallum committed
261

262
    @staticmethod
263
    def writeFile(topology, positions, file=sys.stdout, keepIds=False, extraParticleIdentifier=' '):
264
        """Write a PDB file containing a single model.
Justin MacCallum's avatar
Justin MacCallum committed
265

Robert McGibbon's avatar
Robert McGibbon committed
266
267
268
269
270
271
272
273
274
275
276
277
278
        Parameters
        ----------
        topology : Topology
            The Topology defining the model to write
        positions : list
            The list of atomic positions to write
        file : file=stdout
            A file to write to
        keepIds : bool=False
            If True, keep the residue and chain IDs specified in the Topology
            rather than generating new ones.  Warning: It is up to the caller to
            make sure these are valid IDs that satisfy the requirements of the
            PDB format.  Otherwise, the output file will be invalid.
Rafal P. Wiewiora's avatar
Rafal P. Wiewiora committed
279
280
        extraParticleIdentifier : string=' '
            String to write in the element column of the ATOM records for atoms whose element is None (extra particles)
281
282
        """
        PDBFile.writeHeader(topology, file)
283
        PDBFile.writeModel(topology, positions, file, keepIds=keepIds, extraParticleIdentifier=extraParticleIdentifier)
284
285
286
287
288
        PDBFile.writeFooter(topology, file)

    @staticmethod
    def writeHeader(topology, file=sys.stdout):
        """Write out the header for a PDB file.
Justin MacCallum's avatar
Justin MacCallum committed
289

Robert McGibbon's avatar
Robert McGibbon committed
290
291
292
293
294
295
        Parameters
        ----------
        topology : Topology
            The Topology defining the molecular system being written
        file : file=stdout
            A file to write the file to
296
        """
297
        print("REMARK   1 CREATED WITH OPENMM %s, %s" % (Platform.getOpenMMVersion(), str(date.today())), file=file)
298
299
        vectors = topology.getPeriodicBoxVectors()
        if vectors is not None:
300
301
302
303
            a, b, c, alpha, beta, gamma = computeLengthsAndAngles(vectors)
            RAD_TO_DEG = 180/math.pi
            print("CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f P 1           1 " % (
                    a*10, b*10, c*10, alpha*RAD_TO_DEG, beta*RAD_TO_DEG, gamma*RAD_TO_DEG), file=file)
Justin MacCallum's avatar
Justin MacCallum committed
304

305
    @staticmethod
306
    def writeModel(topology, positions, file=sys.stdout, modelIndex=None, keepIds=False, extraParticleIdentifier=' '):
307
        """Write out a model to a PDB file.
Justin MacCallum's avatar
Justin MacCallum committed
308

Robert McGibbon's avatar
Robert McGibbon committed
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
        Parameters
        ----------
        topology : Topology
            The Topology defining the model to write
        positions : list
            The list of atomic positions to write
        file : file=stdout
            A file to write the model to
        modelIndex : int=None
            If not None, the model will be surrounded by MODEL/ENDMDL records
            with this index
        keepIds : bool=False
            If True, keep the residue and chain IDs specified in the Topology
            rather than generating new ones.  Warning: It is up to the caller to
            make sure these are valid IDs that satisfy the requirements of the
324
325
            PDB format.  No guarantees are made about what will happen if they
            are not, and the output file could be invalid.
Rafal P. Wiewiora's avatar
Rafal P. Wiewiora committed
326
327
        extraParticleIdentifier : string=' '
            String to write in the element column of the ATOM records for atoms whose element is None (extra particles)
328
        """
Robert McGibbon's avatar
Robert McGibbon committed
329

330
        if len(list(topology.atoms())) != len(positions):
Justin MacCallum's avatar
Justin MacCallum committed
331
            raise ValueError('The number of positions must match the number of atoms')
332
333
        if is_quantity(positions):
            positions = positions.value_in_unit(angstroms)
334
335
336
337
        if any(math.isnan(norm(pos)) for pos in positions):
            raise ValueError('Particle position is NaN')
        if any(math.isinf(norm(pos)) for pos in positions):
            raise ValueError('Particle position is infinite')
338
339
        nonHeterogens = PDBFile._standardResidues[:]
        nonHeterogens.remove('HOH')
340
341
342
        atomIndex = 1
        posIndex = 0
        if modelIndex is not None:
343
            print("MODEL     %4d" % modelIndex, file=file)
344
        for (chainIndex, chain) in enumerate(topology.chains()):
345
            if keepIds and len(chain.id) == 1:
346
347
348
                chainName = chain.id
            else:
                chainName = chr(ord('A')+chainIndex%26)
349
350
351
352
353
354
            residues = list(chain.residues())
            for (resIndex, res) in enumerate(residues):
                if len(res.name) > 3:
                    resName = res.name[:3]
                else:
                    resName = res.name
355
                if keepIds and len(res.id) < 5:
356
357
358
                    resId = res.id
                else:
                    resId = "%4d" % ((resIndex+1)%10000)
359
360
361
                if len(res.insertionCode) == 1:
                    resIC = res.insertionCode
                else:
peastman's avatar
peastman committed
362
                    resIC = " "
363
364
365
366
                if res.name in nonHeterogens:
                    recordName = "ATOM  "
                else:
                    recordName = "HETATM"
367
                for atom in res.atoms():
368
369
370
                    if atom.element is not None:
                        symbol = atom.element.symbol
                    else:
371
                        symbol = extraParticleIdentifier
372
                    if len(atom.name) < 4 and atom.name[:1].isalpha() and len(symbol) < 2:
373
374
375
376
377
378
                        atomName = ' '+atom.name
                    elif len(atom.name) > 4:
                        atomName = atom.name[:4]
                    else:
                        atomName = atom.name
                    coords = positions[posIndex]
peastman's avatar
peastman committed
379
                    line = "%s%5d %-4s %3s %s%4s%1s   %s%s%s  1.00  0.00          %2s  " % (
peastman's avatar
peastman committed
380
                        recordName, atomIndex%100000, atomName, resName, chainName, resId, resIC, _format_83(coords[0]),
Robert McGibbon's avatar
Robert McGibbon committed
381
                        _format_83(coords[1]), _format_83(coords[2]), symbol)
peastman's avatar
peastman committed
382
383
                    if len(line) != 80:
                        raise ValueError('Fixed width overflow detected')
384
                    print(line, file=file)
385
386
387
                    posIndex += 1
                    atomIndex += 1
                if resIndex == len(residues)-1:
388
                    print("TER   %5d      %3s %s%4s" % (atomIndex, resName, chainName, resId), file=file)
389
390
                    atomIndex += 1
        if modelIndex is not None:
391
            print("ENDMDL", file=file)
392
393
394
395

    @staticmethod
    def writeFooter(topology, file=sys.stdout):
        """Write out the footer for a PDB file.
Justin MacCallum's avatar
Justin MacCallum committed
396

Robert McGibbon's avatar
Robert McGibbon committed
397
398
399
400
401
402
        Parameters
        ----------
        topology : Topology
            The Topology defining the molecular system being written
        file : file=stdout
            A file to write the file to
403
        """
peastman's avatar
peastman committed
404
        # Identify bonds that should be listed as CONECT records.
Robert McGibbon's avatar
Robert McGibbon committed
405

peastman's avatar
peastman committed
406
407
        conectBonds = []
        for atom1, atom2 in topology.bonds():
408
            if atom1.residue.name not in PDBFile._standardResidues or atom2.residue.name not in PDBFile._standardResidues:
peastman's avatar
peastman committed
409
410
411
412
                conectBonds.append((atom1, atom2))
            elif atom1.name == 'SG' and atom2.name == 'SG' and atom1.residue.name == 'CYS' and atom2.residue.name == 'CYS':
                conectBonds.append((atom1, atom2))
        if len(conectBonds) > 0:
Robert McGibbon's avatar
Robert McGibbon committed
413

peastman's avatar
peastman committed
414
            # Work out the index used in the PDB file for each atom.
Robert McGibbon's avatar
Robert McGibbon committed
415

peastman's avatar
peastman committed
416
417
418
419
420
421
422
423
424
425
            atomIndex = {}
            nextAtomIndex = 0
            prevChain = None
            for chain in topology.chains():
                for atom in chain.atoms():
                    if atom.residue.chain != prevChain:
                        nextAtomIndex += 1
                        prevChain = atom.residue.chain
                    atomIndex[atom] = nextAtomIndex
                    nextAtomIndex += 1
Robert McGibbon's avatar
Robert McGibbon committed
426

peastman's avatar
peastman committed
427
            # Record which other atoms each atom is bonded to.
Robert McGibbon's avatar
Robert McGibbon committed
428

peastman's avatar
peastman committed
429
430
431
432
433
434
435
436
437
438
            atomBonds = {}
            for atom1, atom2 in conectBonds:
                index1 = atomIndex[atom1]
                index2 = atomIndex[atom2]
                if index1 not in atomBonds:
                    atomBonds[index1] = []
                if index2 not in atomBonds:
                    atomBonds[index2] = []
                atomBonds[index1].append(index2)
                atomBonds[index2].append(index1)
Robert McGibbon's avatar
Robert McGibbon committed
439

peastman's avatar
peastman committed
440
            # Write the CONECT records.
Robert McGibbon's avatar
Robert McGibbon committed
441

peastman's avatar
peastman committed
442
443
444
            for index1 in sorted(atomBonds):
                bonded = atomBonds[index1]
                while len(bonded) > 4:
445
                    print("CONECT%5d%5d%5d%5d" % (index1, bonded[0], bonded[1], bonded[2]), file=file)
peastman's avatar
peastman committed
446
447
448
449
                    del bonded[:4]
                line = "CONECT%5d" % index1
                for index2 in bonded:
                    line = "%s%5d" % (line, index2)
450
451
                print(line, file=file)
        print("END", file=file)
452

453
454
455
456
457

def _format_83(f):
    """Format a single float into a string of width 8, with ideally 3 decimal
    places of precision. If the number is a little too large, we can
    gracefully degrade the precision by lopping off some of the decimal
Robert McGibbon's avatar
Robert McGibbon committed
458
    places. If it's much too large, we throw a ValueError"""
459
460
461
462
    if -999.999 < f < 9999.999:
        return '%8.3f' % f
    if -9999999 < f < 99999999:
        return ('%8.3f' % f)[:8]
463
    raise ValueError('coordinate "%s" could not be represented '
Robert McGibbon's avatar
Robert McGibbon committed
464
                     'in a width-8 field' % f)