Redo the way CHARMM PSF files are parsed. This should be much more robust in

terms of supporting PSF files that don't really conform to the "standard" way that CHARMM prints them. The downside is that it relies on ! comment tags which I think are comments for CHARMM. However, the use of !NATOM and !NBOND and other tags seem to be _much_ more conserved than the general format of the PSF file across different dialects of PSFs. So the current approach should be more flexible.

Redo the way CHARMM PSF files are parsed. This should be much more robust in
terms of supporting PSF files that don't really conform to the "standard" way that CHARMM prints them. The downside is that it relies on ! comment tags which I think are comments for CHARMM. However, the use of !NATOM and !NBOND and other tags seem to be _much_ more conserved than the general format of the PSF file across different dialects of PSFs. So the current approach should be more flexible.
4b9221e3 · Jason Swails · 27381f70 · 4b9221e3 · 4b9221e3
Commit 4b9221e3 authored Aug 09, 2014 by Jason Swails
2 changed files
--- a/wrappers/python/simtk/openmm/app/charmmpsffile.py
+++ b/wrappers/python/simtk/openmm/app/charmmpsffile.py
@@ -45,14 +45,13 @@ from simtk.openmm.app.amberprmtopfile import HCT, OBC1, OBC2, GBn, GBn2
 from simtk.openmm.app.internal.customgbforces import (GBSAHCTForce,
                GBSAOBC1Force, GBSAOBC2Force, GBSAGBnForce, GBSAGBn2Force)
 # CHARMM imports
-from simtk.openmm.app.internal.charmm._charmmfile import CharmmFile
 from simtk.openmm.app.internal.charmm.topologyobjects import (
                ResidueList, AtomList, TrackedList, Bond, Angle, Dihedral,
                Improper, AcceptorDonor, Group, Cmap, UreyBradley,
                NoUreyBradley)
 from simtk.openmm.app.internal.charmm.exceptions import (
                CharmmPSFError, MoleculeError, CharmmPSFWarning,
-                MissingParameter)
+                MissingParameter, CharmmPsfEOF)
 import warnings
 TINY = 1e-8
@@ -74,6 +73,29 @@ def _catchindexerror(func):
    return newfunc
+class _ZeroDict(dict):
+    """
+    Contains a dict that returns dummy (zero) arguments when a key is not
+    present rather than raising a KeyError.  The return value for non-existent
+    items is (0, []). It also special-case sections that have multiple pointers
+    to avoid index errors if those are not present in the PSF file
+    """
+    def __getitem__(self, key):
+        try:
+            return dict.__getitem__(self, key)
+        except KeyError:
+            if key.startswith('NGRP'):
+                for k in self:
+                    if k.startswith('NGRP'):
+                        return dict.__getitem__(self, k)
+                return [0, 0], []
+            elif key.startswith('NUMLP'):
+                for k in self:
+                    if k.startswith('NUMLP'):
+                        return dict.__getitem__(self, k)
+                return [0, 0], []
+            return 0, []
 # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 class CharmmPsfFile(object):
@@ -128,8 +150,8 @@ class CharmmPsfFile(object):
    @_catchindexerror
    def __init__(self, psf_name):
        """
-        Opens and parses a PSF file, then instantiates a CharmmPsfFile instance
+        Opens and parses a PSF file, then instantiates a CharmmPsfFile
-        from the data.
+        instance from the data.
        Parameters:
            psf_name (str) : Name of the PSF file (it must exist)
@@ -143,33 +165,32 @@ class CharmmPsfFile(object):
        if not os.path.exists(psf_name):
            raise IOError('Could not find PSF file %s' % psf_name)
        # Open the PSF and read the first line. It must start with "PSF"
-        psf = CharmmFile(psf_name, 'r')
+        psf = open(psf_name, 'r')
        line = psf.readline()
        if not line.startswith('PSF'):
            raise CharmmPSFError('Unrecognized PSF file. First line is %s' %
                                 line.strip())
        # Store the flags
        psf_flags = line.split()[1:]
-        # Next line is blank
+        # Now get all of the sections and store them in a dict
        psf.readline()
-        # The next line has one number -- the number of title lines
+        # Now get all of the sections
-        ntitle = conv(psf.readline().strip(), int, 'title count')
+        psfsections = _ZeroDict()
+        while True:
+            try:
+                sec, ptr, data = CharmmPsfFile._parse_psf_section(psf)
+            except CharmmPsfEOF:
+                break
+            psfsections[sec] = (ptr, data)
        # store the title
-        title = list()
+        title = psfsections['NTITLE'][1]
-        for i in range(ntitle):
-            title.append(psf.readline().rstrip())
-        # Skip all blank lines. Most of the time there is only 1, but I've seen
-        # a file that has 2
-        line = psf.readline().strip()
-        while not line:
-            line = psf.readline().strip()
        # Next is the number of atoms
-        natom = conv(line, int, 'natom')
+        natom = conv(psfsections['NATOM'][0], int, 'natom')
        # Parse all of the atoms
        residue_list = ResidueList()
        atom_list = AtomList()
        for i in xrange(natom):
-            words = psf.readline().split()
+            words = psfsections['NATOM'][1][i].split()
            atid = int(words[0])
            if atid != i + 1:
                raise CharmmPSFError('Nonsequential atoms detected!')
@@ -190,10 +211,10 @@ class CharmmPsfFile(object):
                                         attype, charge, mass, props)
            atom_list.append(atom)
        atom_list.assign_indexes()
-        # Eat the next line
+        atom_list.changed = False
-        psf.readline()
        # Now get the number of bonds
-        nbond, holder = CharmmPsfFile._parse_psf_section(psf, int)
+        nbond = conv(psfsections['NBOND'][0], int, 'number of bonds')
+        holder = psfsections['NBOND'][1]
        bond_list = TrackedList()
        if len(holder) != nbond * 2:
            raise CharmmPSFError('Got %d indexes for %d bonds' %
@@ -204,7 +225,8 @@ class CharmmPsfFile(object):
            bond_list.append(Bond(atom_list[id1], atom_list[id2]))
        bond_list.changed = False
        # Now get the number of angles and the angle list
-        ntheta, holder = CharmmPsfFile._parse_psf_section(psf, int)
+        ntheta = conv(psfsections['NTHETA'][0], int, 'number of angles')
+        holder = psfsections['NTHETA'][1]
        angle_list = TrackedList()
        if len(holder) != ntheta * 3:
            raise CharmmPSFError('Got %d indexes for %d angles' %
@@ -218,7 +240,8 @@ class CharmmPsfFile(object):
            )
        angle_list.changed = False
        # Now get the number of torsions and the torsion list
-        nphi, holder = CharmmPsfFile._parse_psf_section(psf, int)
+        nphi = conv(psfsections['NPHI'][0], int, 'number of torsions')
+        holder = psfsections['NPHI'][1]
        dihedral_list = TrackedList()
        if len(holder) != nphi * 4:
            raise CharmmPSFError('Got %d indexes for %d torsions' %
@@ -234,7 +257,8 @@ class CharmmPsfFile(object):
            )
        dihedral_list.changed = False
        # Now get the number of improper torsions
-        nimphi, holder = CharmmPsfFile._parse_psf_section(psf, int)
+        nimphi = conv(psfsections['NIMPHI'][0], int, 'number of impropers')
+        holder = psfsections['NIMPHI'][1]
        improper_list = TrackedList()
        if len(holder) != nimphi * 4:
            raise CharmmPSFError('Got %d indexes for %d impropers' %
@@ -250,7 +274,8 @@ class CharmmPsfFile(object):
            )
        improper_list.changed = False
        # Now handle the donors (what is this used for??)
-        ndon, holder = CharmmPsfFile._parse_psf_section(psf, int)
+        ndon = conv(psfsections['NDON'][0], int, 'number of donors')
+        holder = psfsections['NDON'][1]
        donor_list = TrackedList()
        if len(holder) != ndon * 2:
            raise CharmmPSFError('Got %d indexes for %d donors' %
@@ -261,7 +286,8 @@ class CharmmPsfFile(object):
            donor_list.append(AcceptorDonor(atom_list[id1], atom_list[id2]))
        donor_list.changed = False
        # Now handle the acceptors (what is this used for??)
-        nacc, holder = CharmmPsfFile._parse_psf_section(psf, int)
+        nacc = conv(psfsections['NACC'][0], int, 'number of acceptors')
+        holder = psfsections['NACC'][1]
        acceptor_list = TrackedList()
        if len(holder) != nacc * 2:
            raise CharmmPSFError('Got %d indexes for %d acceptors' %
@@ -271,16 +297,13 @@ class CharmmPsfFile(object):
            id2 = holder[2*i+1] - 1
            acceptor_list.append(AcceptorDonor(atom_list[id1], atom_list[id2]))
        acceptor_list.changed = False
-        # Now get the NNB section. Not sure what this section is for or what it
-        # does...
-        nnb, holder = CharmmPsfFile._parse_psf_section(psf, int)
        # Now get the group sections
-        pointers, holder = CharmmPsfFile._parse_psf_section(psf, int)
        group_list = TrackedList()
        try:
-            ngrp, nst2 = pointers
+            ngrp, nst2 = psfsections['NGRP NST2'][0]
        except ValueError:
            raise CharmmPSFError('Could not unpack GROUP pointers')
+        holder = psfsections['NGRP NST2'][1]
        group_list.nst2 = nst2
        # Now handle the groups
        if len(holder) != ngrp * 3:
@@ -292,19 +315,8 @@ class CharmmPsfFile(object):
            i3 = holder[3*i+2]
            group_list.append(Group(i1, i2, i3))
        group_list.changed = False
-        # The next section might be the number of molecules or it might be the
-        # cross-term (cmap) section. The first thing we'll do is determine
-        # molecularity based on the atom connectivity. If every PSF file was
-        # guaranteed to be "correct", we could just compare the MOLNT
-        # section with the one we compute here. However, CHARMM GUI appears
-        # to assign MOLNT as a dummy section (with all 1's), so this
-        # approach will not work. Instead, look at the value of the pointer
-        # and the number of entries in the group. If the # of entries is
-        # NATOM, assume we have MOLNT section. Warn if the MOLNT section is
-        # 'wrong'...
-        pointer, holder = CharmmPsfFile._parse_psf_section(psf, int)
        # Assign all of the atoms to molecules recursively
+        holder = psfsections['MOLNT'][1]
        set_molecules(atom_list)
        molecule_list = [atom.marked for atom in atom_list]
        if len(holder) == len(atom_list):
@@ -312,21 +324,14 @@ class CharmmPsfFile(object):
                warnings.warn('Detected PSF molecule section that is WRONG. '
                              'Resetting molecularity.', CharmmPSFWarning)
            # We have a CHARMM PSF file; now do NUMLP/NUMLPH sections
-            words = psf.readline().split()
+            numlp, numlph = psfsections['NUMLP NUMLPH'][0]
-            numlp = conv(words[0], int, 'numlp')
-            numlph = conv(words[1], int, 'numlph')
            if numlp != 0 or numlph != 0:
                raise NotImplemented('Cannot currently handle PSFs with lone '
                                     'pairs defined in the NUMLP/NUMLPH '
                                     'section.')
-            psf.readline() # blank
+        # Now do the CMAPs
-            # Now we get to the cross-term section
+        ncrterm = conv(psfsections['NCRTERM'][0], int, 'Number of cross-terms')
-            ncrterm, holder = CharmmPsfFile._parse_psf_section(psf, int)
+        holder = psfsections['NCRTERM'][1]
-        else:
-            ncrterm = pointer
-        # At this point, ncrterm and holder are both set to the CMAP list for
-        # VMD and non-VMD PSFs.
-        # Now get the cmaps
        cmap_list = TrackedList()
        if len(holder) != ncrterm * 8:
            raise CharmmPSFError('Got %d CMAP indexes for %d cmap terms' %
@@ -347,7 +352,6 @@ class CharmmPsfFile(object):
            )
        cmap_list.changed = False
-        # Now set the instance attributes
        self.residue_list = residue_list
        self.atom_list = atom_list
        self.bond_list = bond_list
@@ -381,18 +385,18 @@ class CharmmPsfFile(object):
            raise CharmmPSFError('Could not convert %s' % message)
    @staticmethod
-    def _parse_psf_section(psf, dtype):
+    def _parse_psf_section(psf):
        """
        This method parses a section of the PSF file
        Parameters:
            - psf (CharmmFile) : Open file that is pointing to the first line
                                 of the section that is to be parsed
-            - dtype (type) : The data type to convert all of the data into
        Returns:
-            (pointers, data)
+            (title, pointers, data)
+            - title (str) : The label of the PSF section we are parsing
            - pointers (int/tuple of ints) : If one pointer is set, pointers is
                    simply the integer that is value of that pointer. Otherwise
                    it is a tuple with every pointer value defined in the first
@@ -401,22 +405,42 @@ class CharmmPsfFile(object):
                    to `dtype'
        """
        conv = CharmmPsfFile._convert
-        words = psf.readline().split()
+        line = psf.readline()
+        while not line.strip():
+            if not line:
+                raise CharmmPsfEOF('Unexpected EOF in PSF file')
+            else:
+                line = psf.readline()
+        if '!' in line:
+            words = line[:line.index('!')].split()
+            title = line[line.index('!')+1:].strip().upper()
+            # Strip out description
+            if ':' in title:
+                title = title[:title.index(':')]
+        else:
+            raise CharmmPSFError('Could not determine section title')
        if len(words) == 1:
            pointers = conv(words[0], int, 'pointer')
        else:
            pointers = tuple([conv(w, int, 'pointer') for w in words])
        line = psf.readline().strip()
-        if not line:
+        if not line and title.startswith('NNB'):
            # This will correctly handle the NNB section (which has a spurious
            # blank line) as well as any sections that have 0 members.
            line = psf.readline().strip()
        data = []
-        while line:
+        if title == 'NATOM' or title == 'NTITLE':
-            words = line.split()
+            # Store these two sections as strings (ATOM section we will parse
-            data.extend([conv(w, dtype, 'PSF data') for w in words])
+            # later). The rest of the sections are integer pointers
-            line = psf.readline().strip()
+            while line:
-        return pointers, data
+                data.append(line)
+                line = psf.readline().strip()
+        else:
+            while line:
+                words = line.split()
+                data.extend([conv(w, int, 'PSF data') for w in words])
+                line = psf.readline().strip()
+        return title, pointers, data
    def writePsf(self, dest, vmd=False):
        """

--- a/wrappers/python/simtk/openmm/app/internal/charmm/exceptions.py
+++ b/wrappers/python/simtk/openmm/app/internal/charmm/exceptions.py
@@ -42,6 +42,9 @@ class CharmmWarning(Warning):
 class CharmmPSFError(CharmmError):
    """ If there is a problem parsing CHARMM PSF files """
+class CharmmPsfEOF(CharmmError):
+    """ Raised when the end-of-file is reached on a CHARMM PSF file """
 class SplitResidueWarning(CharmmWarning):
    """ For if a residue with the same number but different names is split """