Merge pull request #871 from swails/psfinscode

Improve CHARMM parsing when resnums have inscodes

Merge pull request #871 from swails/psfinscode
Improve CHARMM parsing when resnums have inscodes
9da36463 · peastman · 6f1f89d8 · f97f8685 · 9da36463 · 9da36463
Commit 9da36463 authored Apr 22, 2015 by peastman
6 changed files
--- a/wrappers/python/simtk/openmm/app/charmmpsffile.py
+++ b/wrappers/python/simtk/openmm/app/charmmpsffile.py
@@ -37,6 +37,7 @@ from __future__ import division
 from functools import wraps
 from math import pi, cos, sin, sqrt
 import os
+import re
 import simtk.openmm as mm
 from simtk.openmm.vec3 import Vec3
 import simtk.unit as u
@@ -99,6 +100,8 @@ class _ZeroDict(dict):

 # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

+_resre = re.compile(r'(\d+)([a-zA-Z]*)')
+
 class CharmmPsfFile(object):
    """
    A chemical structure instantiated from CHARMM files.
@@ -192,11 +195,13 @@ class CharmmPsfFile(object):
        atom_list = AtomList()
        for i in xrange(natom):
            words = psfsections['NATOM'][1][i].split()
-            atid = int(words[0])
-            if atid != i + 1:
-                raise CharmmPSFError('Nonsequential atoms detected!')
            system = words[1]
-            resid = conv(words[2], int, 'residue number')
+            rematch = _resre.match(words[2])
+            if not rematch:
+                raise RuntimeError('Could not parse residue number %s' %
+                                   words[2])
+            resid, inscode = rematch.groups()
+            resid = int(resid)
            resname = words[3]
            name = words[4]
            attype = words[5]
@@ -209,7 +214,7 @@ class CharmmPsfFile(object):
            mass = conv(words[7], float, 'atomic mass')
            props = words[8:]
            atom = residue_list.add_atom(system, resid, resname, name,
-                                         attype, charge, mass, props)
+                            attype, charge, mass, inscode, props)
            atom_list.append(atom)
        atom_list.assign_indexes()
        atom_list.changed = False
@@ -443,175 +448,6 @@ class CharmmPsfFile(object):
                line = psf.readline().strip()
        return title, pointers, data

-    def writePsf(self, dest, vmd=False):
-        """
-        Writes a PSF file from the stored molecule
-
-        Parameters:
-            - dest (str or file-like) : The place to write the output PSF file.
-                    If it has a "write" attribute, it will be used to print the
-                    PSF file. Otherwise, it will be treated like a string and a
-                    file will be opened, printed, then closed
-            - vmd (bool) : If True, it will write out a PSF in the format that
-                    VMD prints it in (i.e., no NUMLP/NUMLPH or MOLNT sections)
-        Example:
-            >>> cs = CharmmPsfFile('testfiles/test.psf')
-            >>> cs.writePsf('testfiles/test2.psf')
-        """
-        # See if this is an extended format
-        ext = 'EXT' in self.flags
-        own_handle = False
-        # Index the atoms and residues
-        self.residue_list.assign_indexes()
-        self.atom_list.assign_indexes()
-        if not hasattr(dest, 'write'):
-            own_handle = True
-            dest = open(dest, 'w')
-
-        # Assign the formats we need to write with
-        if ext:
-            atmfmt1 = ('%10d %-8s %-8i %-8s %-8s %4d %10.6f %13.4f' + 11*' ')
-            atmfmt2 = ('%10d %-8s %-8i %-8s %-8s %-4s %10.6f %13.4f' + 11*' ')
-            intfmt = '%10d' # For pointers
-        else:
-            atmfmt1 = ('%8d %-4s %-4i %-4s %-4s %4d %10.6f %13.4f' + 11*' ')
-            atmfmt2 = ('%8d %-4s %-4i %-4s %-4s %-4s %10.6f %13.4f' + 11*' ')
-            intfmt = '%8d' # For pointers
-
-        # Now print the header then the title
-        dest.write('PSF ' + ' '.join(self.flags) + '\n')
-        dest.write('\n')
-        dest.write(intfmt % len(self.title) + ' !NTITLE\n')
-        dest.write('\n'.join(self.title) + '\n\n')
-        # Now time for the atoms
-        dest.write(intfmt % len(self.atom_list) + ' !NATOM\n')
-        # atmfmt1 is for CHARMM format (i.e., atom types are integers)
-        # atmfmt is for XPLOR format (i.e., atom types are strings)
-        for i, atom in enumerate(self.atom_list):
-            if isinstance(atom.attype, str):
-                fmt = atmfmt2
-            else:
-                fmt = atmfmt1
-            atmstr = fmt % (i+1, atom.system, atom.residue.resnum,
-                            atom.residue.resname, atom.name, atom.attype,
-                            atom.charge, atom.mass)
-            dest.write(atmstr + '   '.join(atom.props) + '\n')
-        dest.write('\n')
-        # Bonds
-        dest.write(intfmt % len(self.bond_list) + ' !NBOND: bonds\n')
-        for i, bond in enumerate(self.bond_list):
-            dest.write((intfmt*2) % (bond.atom1.idx+1, bond.atom2.idx+1))
-            if i % 4 == 3: # Write 4 bonds per line
-                dest.write('\n')
-        # See if we need to terminate
-        if len(self.bond_list) % 4 != 0 or len(self.bond_list) == 0:
-            dest.write('\n')
-        dest.write('\n')
-        # Angles
-        dest.write(intfmt % len(self.angle_list) + ' !NTHETA: angles\n')
-        for i, angle in enumerate(self.angle_list):
-            dest.write((intfmt*3) % (angle.atom1.idx+1, angle.atom2.idx+1,
-                                     angle.atom3.idx+1)
-            )
-            if i % 3 == 2: # Write 3 angles per line
-                dest.write('\n')
-        # See if we need to terminate
-        if len(self.angle_list) % 3 != 0 or len(self.angle_list) == 0:
-            dest.write('\n')
-        dest.write('\n')
-        # Dihedrals
-        dest.write(intfmt % len(self.dihedral_list) + ' !NPHI: dihedrals\n')
-        for i, dih in enumerate(self.dihedral_list):
-            dest.write((intfmt*4) % (dih.atom1.idx+1, dih.atom2.idx+1,
-                                     dih.atom3.idx+1, dih.atom4.idx+1)
-            )
-            if i % 2 == 1: # Write 2 dihedrals per line
-                dest.write('\n')
-        # See if we need to terminate
-        if len(self.dihedral_list) % 2 != 0 or len(self.dihedral_list) == 0:
-            dest.write('\n')
-        dest.write('\n')
-        # Impropers
-        dest.write(intfmt % len(self.improper_list) + ' !NIMPHI: impropers\n')
-        for i, imp in enumerate(self.improper_list):
-            dest.write((intfmt*4) % (imp.atom1.idx+1, imp.atom2.idx+1,
-                                     imp.atom3.idx+1, imp.atom4.idx+1)
-            )
-            if i % 2 == 1: # Write 2 dihedrals per line
-                dest.write('\n')
-        # See if we need to terminate
-        if len(self.improper_list) % 2 != 0 or len(self.improper_list) == 0:
-            dest.write('\n')
-        dest.write('\n')
-        # Donor section
-        dest.write(intfmt % len(self.donor_list) + ' !NDON: donors\n')
-        for i, don in enumerate(self.donor_list):
-            dest.write((intfmt*2) % (don.atom1.idx+1, don.atom2.idx+1))
-            if i % 4 == 3: # 4 donors per line
-                dest.write('\n')
-        if len(self.donor_list) % 4 != 0 or len(self.donor_list) == 0:
-            dest.write('\n')
-        dest.write('\n')
-        # Acceptor section
-        dest.write(intfmt % len(self.acceptor_list) + ' !NACC: acceptors\n')
-        for i, acc in enumerate(self.acceptor_list):
-            dest.write((intfmt*2) % (acc.atom1.idx+1, acc.atom2.idx+1))
-            if i % 4 == 3: # 4 donors per line
-                dest.write('\n')
-        if len(self.acceptor_list) % 4 != 0 or len(self.acceptor_list) == 0:
-            dest.write('\n')
-        dest.write('\n')
-        # NNB section ??
-        dest.write(intfmt % 0 + ' !NNB\n\n')
-        for i in range(len(self.atom_list)):
-            dest.write(intfmt % 0)
-            if i % 8 == 7: # Write 8 0's per line
-                dest.write('\n')
-        if len(self.atom_list) % 8 != 0: dest.write('\n')
-        dest.write('\n')
-        # Group section
-        dest.write((intfmt*2) % (len(self.group_list), self.group_list.nst2))
-        dest.write(' !NGRP NST2\n')
-        for i, gp in enumerate(self.group_list):
-            dest.write((intfmt*3) % (gp.bs, gp.type, gp.move))
-            if i % 3 == 2: dest.write('\n')
-        if len(self.group_list) % 3 != 0 or len(self.group_list) == 0:
-            dest.write('\n')
-        dest.write('\n')
-        # The next two sections are never found in VMD prmtops...
-        if not vmd:
-            # Molecule section; first set molecularity
-            set_molecules(self.atom_list)
-            mollist = [a.marked for a in self.atom_list]
-            dest.write(intfmt % max(mollist) + ' !MOLNT\n')
-            for i, atom in enumerate(self.atom_list):
-                dest.write(intfmt % atom.marked)
-                if i % 8 == 7: dest.write('\n')
-            if len(self.atom_list) % 8 != 0: dest.write('\n')
-            dest.write('\n')
-            # NUMLP/NUMLPH section
-            dest.write((intfmt*2) % (0, 0) + ' !NUMLP NUMLPH\n')
-            dest.write('\n')
-        # CMAP section
-        dest.write(intfmt % len(self.cmap_list) + ' !NCRTERM: cross-terms\n')
-        for i, cmap in enumerate(self.cmap_list):
-            dest.write((intfmt*4) % (cmap.atom1.idx+1, cmap.atom2.idx+1,
-                                     cmap.atom3.idx+1, cmap.atom4.idx+1)
-            )
-            if cmap.consecutive:
-                dest.write((intfmt*4) % (cmap.atom2.idx+1, cmap.atom3.idx+1,
-                                         cmap.atom4.idx+1, cmap.atom5.idx+1)
-                )
-            else:
-                dest.write((intfmt*4) % (cmap.atom5.idx+1, cmap.atom6.idx+1,
-                                         cmap.atom7.idx+1, cmap.atom8.idx+1)
-                )
-            dest.write('\n')
-        # Done!
-        # If we opened our own handle, close it
-        if own_handle:
-            dest.close()
-        
    def loadParameters(self, parmset):
        """
        Loads parameters from a parameter set that was loaded via CHARMM RTF,
@@ -776,13 +612,13 @@ class CharmmPsfFile(object):
        last_residue = None
        # Add each chain (separate 'system's) and residue
        for atom in self.atom_list:
+            resid = '%d%s' % (atom.residue.idx, atom.residue.inscode)
            if atom.system != last_chain:
                chain = topology.addChain(atom.system)
                last_chain = atom.system
-            if atom.residue.idx != last_residue:
-                last_residue = atom.residue.idx
-                residue = topology.addResidue(atom.residue.resname, chain,
-                                              str(atom.residue.idx))
+            if resid != last_residue:
+                last_residue = resid
+                residue = topology.addResidue(atom.residue.resname, chain, resid)
            if atom.type is not None:
                # This is the most reliable way of determining the element
                atomic_num = atom.type.atomic_number

--- a/wrappers/python/simtk/openmm/app/element.py
+++ b/wrappers/python/simtk/openmm/app/element.py
@@ -31,7 +31,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 __author__ = "Christopher M. Bruns"
 __version__ = "1.0"

-
+from collections import OrderedDict
 from simtk.unit import daltons, is_quantity
 import copy_reg

@@ -47,6 +47,7 @@ class Element(object):

    _elements_by_symbol = {}
    _elements_by_atomic_number = {}
+    _elements_by_mass = None

    def __init__(self, number, name, symbol, mass):
        """Create a new element
@@ -67,8 +68,11 @@ class Element(object):
        self._mass = mass
        # Index this element in a global table
        s = symbol.strip().upper()
+        ## If we add a new element, we need to re-hash elements by mass
+        Element._elements_by_mass = None

-        assert s not in Element._elements_by_symbol
+        if s in Element._elements_by_symbol:
+            raise ValueError('Duplicate element symbol %s' % s)
        Element._elements_by_symbol[s] = self
        if number in Element._elements_by_atomic_number:
            other_element = Element._elements_by_atomic_number[number]
@@ -96,20 +100,45 @@ class Element(object):
        """
        Get the element whose mass is CLOSEST to the requested mass. This method
        should not be used for repartitioned masses
+
+        Parameters
+        ----------
+        mass : float or Quantity
+            Mass of the atom to find the element for. Units assumed to be
+            daltons if not specified
+
+        Returns
+        -------
+        element : Element
+            The element whose atomic mass is closest to the input mass
        """
        # Assume masses are in daltons if they are not units
-        if not is_quantity(mass):
-            mass = mass * daltons
+        if is_quantity(mass):
+            mass = mass.value_in_unit(daltons)
+        if mass < 0:
+            raise ValueError('Invalid Higgs field')
+        # If this is our first time calling getByMass (or we added an element
+        # since the last call), re-generate the ordered by-mass dict cache
+        if Element._elements_by_mass is None:
+            Element._elements_by_mass = OrderedDict()
+            for elem in sorted(Element._elements_by_symbol.values(),
+                               key=lambda x: x.mass):
+                Element._elements_by_mass[elem.mass] = elem
+
        diff = mass
        best_guess = None

-        for key in Element._elements_by_atomic_number:
-            element = Element._elements_by_atomic_number[key]
-            massdiff = abs(element.mass - mass)
+        for elemmass, element in Element._elements_by_mass.iteritems():
+            massdiff = abs(elemmass._value - mass)
            if massdiff < diff:
                best_guess = element
                diff = massdiff
+            if elemmass._value > mass:
+                # Elements are only getting heavier, so bail out early
+                return best_guess

+        # This really should only happen if we wanted ununoctium or something
+        # bigger... won't really happen but still make sure we return an Element
        return best_guess

    @property
@@ -145,6 +174,9 @@ def _pickle_element(element):

 copy_reg.pickle(Element, _pickle_element)

+# NOTE: getElementByMass assumes all masses are Quantity instances with unit
+# "daltons". All elements need to obey this assumption, or that method will
+# fail. No checking is done in getElementByMass for performance reasons
 hydrogen =       Element(  1, "hydrogen", "H", 1.007947*daltons)
 deuterium =      Element(  1, "deuterium", "D", 2.01355321270*daltons)
 helium =         Element(  2, "helium", "He", 4.003*daltons)

--- a/wrappers/python/simtk/openmm/app/internal/charmm/topologyobjects.py
+++ b/wrappers/python/simtk/openmm/app/internal/charmm/topologyobjects.py
@@ -375,12 +375,13 @@ class AtomList(TrackedList):
 class Residue(object):
    """ Residue class """

-    def __init__(self, resname, idx):
+    def __init__(self, resname, idx, inscode=''):
        self.resname = resname
        self.idx = idx
        self.atoms = []
        self.resnum = None # Numbered based on SYSTEM name
        self.system = None
+        self.inscode = inscode

    def add_atom(self, atom):
        if self.system is None:
@@ -416,7 +417,7 @@ class Residue(object):
            return self.resname == thing.resname and self.idx == thing.idx
        if isinstance(thing, tuple) or isinstance(thing, list):
            # Must be resnum, resname
-            return thing == (self.resname, self.idx)
+            return thing == (self.resname, self.idx, self.inscode)
        return False # No other type can be equal.

    def __ne__(self, thing):
@@ -447,7 +448,7 @@ class ResidueList(list):
            resnum += 1

    def add_atom(self, system, resnum, resname, name,
-                 attype, charge, mass, props=None):
+                 attype, charge, mass, inscode, props=None):
        """
        Adds an atom to the list of residues. If the residue is not the same as
        the last residue that was created, a new Residue is created and added
@@ -461,25 +462,22 @@ class ResidueList(list):
            - attype (int or str) : Type of the atom
            - charge (float) : Partial atomic charge of the atom
            - mass (float) : Mass (amu) of the atom
+            - inscode (str) : Insertion code, if it is specified

        Returns:
            The Atom instance created and added to the list of residues
        """
-        if self._last_residue is None:
-            res = self._last_residue = Residue(resname, resnum)
+        lr = self._last_residue
+        if lr is None:
+            res = self._last_residue = Residue(resname, resnum, inscode)
            list.append(self, res)
-        elif (self._last_residue != (resname, resnum) or
-              system != self._last_residue.system):
-            if (self._last_residue.idx == resnum and
-                self._last_residue.system == system):
-                lresname = self._last_residue.resname
-                warnings.warn('Residue %d split into separate residues %s '
-                              'and %s' % (resnum, lresname, resname),
-                              SplitResidueWarning)
-            res = self._last_residue = Residue(resname, resnum)
+        elif (lr.resname != resname or lr.idx != resname or
+                lr.inscode != inscode  or system != lr.system):
+            res = self._last_residue = Residue(resname, resnum, inscode)
+            res.system = system
            list.append(self, res)
        else:
-            res = self._last_residue
+            res = lr
        atom = Atom(system, name, attype, float(charge), float(mass), props)
        res.add_atom(atom)
        return atom

--- a/wrappers/python/tests/TestCharmmFiles.py
+++ b/wrappers/python/tests/TestCharmmFiles.py
@@ -115,6 +115,13 @@ class TestCharmmFiles(unittest.TestCase):
        ene = state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)
        self.assertAlmostEqual(ene, 15490.0033559, delta=0.05)

+    def test_InsCode(self):
+        """ Test the parsing of PSF files that contain insertion codes in their residue numbers """
+        psf = CharmmPsfFile('systems/4TVP-dmj_wat-ion.psf')
+        self.assertEqual(len(list(psf.topology.atoms())), 66264)
+        self.assertEqual(len(list(psf.topology.residues())), 20169)
+        self.assertEqual(len(list(psf.topology.bonds())), 46634)
+
    def test_ImplicitSolventForces(self):
        """Compute forces for different implicit solvent types, and compare them to ones generated with a previous version of OpenMM to ensure they haven't changed."""
        solventType = [HCT, OBC1, OBC2, GBn, GBn2]

--- a/wrappers/python/tests/TestElement.py
+++ b/wrappers/python/tests/TestElement.py
 import pickle
-import unittest
-from simtk.unit import dalton
+import random
+from simtk.unit import dalton, is_quantity
 from simtk.openmm.app import element
+import unittest

 class TestElement(unittest.TestCase):
    def test_immutable(self):
@@ -14,18 +15,37 @@ class TestElement(unittest.TestCase):
        newsulfur = pickle.loads(pickle.dumps(element.sulfur))
        # make sure that a new object is not created during the pickle/unpickle
        # cycle
-        assert element.sulfur == newsulfur
-        assert element.sulfur is newsulfur
-        assert id(element.sulfur) == id(newsulfur)
+        self.assertEqual(element.sulfur, newsulfur)
+        self.assertTrue(element.sulfur is newsulfur)
    
    def test_attributes(self):
-        assert element.hydrogen.atomic_number == 1
-        assert element.hydrogen.symbol == 'H'
-        assert element.hydrogen.name == 'hydrogen'
-        assert element.hydrogen.mass == 1.007947 * dalton
+        self.assertEqual(element.hydrogen.atomic_number, 1)
+        self.assertEqual(element.hydrogen.symbol, 'H')
+        self.assertEqual(element.hydrogen.name, 'hydrogen')
+        self.assertEqual(element.hydrogen.mass, 1.007947 * dalton)

+    def test_getByMass(self):
+        """ Tests the getByMass method """
+        def exhaustive_search(mass):
+            """
+            Searches through all element symbols and finds the one with the
+            smallest mass difference
+            """
+            min_diff = mass
+            closest_element = None
+            for symbol, elem in element.Element._elements_by_symbol.items():
+                diff = abs(elem.mass._value - mass)
+                if diff < min_diff:
+                    min_diff = diff
+                    closest_element = elem
+            return closest_element

-if __name__ == '__main__':
-    unittest.main()
+        # Check 5000 random numbers between 0 and 300
+        for i in range(5000):
+            mass = random.random() * 300
+            elem = element.Element.getByMass(mass)
+            self.assertTrue(elem is exhaustive_search(mass))


+if __name__ == '__main__':
+    unittest.main()
--- a/wrappers/python/tests/systems/4TVP-dmj_wat-ion.psf
+++ b/wrappers/python/tests/systems/4TVP-dmj_wat-ion.psf