Use hex for large atom and residue IDs in PDB files (#3522)

79740ece · Peter Eastman · GitHub · 7c42ef5e · 79740ece · 79740ece
Unverified Commit 79740ece authored Mar 22, 2022 by Peter Eastman Committed by GitHub Mar 22, 2022
3 changed files
--- a/wrappers/python/openmm/app/internal/pdbstructure.py
+++ b/wrappers/python/openmm/app/internal/pdbstructure.py
@@ -164,10 +164,10 @@ class PdbStructure(object):
            if command == "ATOM  " or command == "HETATM":
                self._add_atom(Atom(pdb_line, self, self.extraParticleIdentifier))
            elif command == "CONECT":
-                atoms = [int(pdb_line[6:11])]
+                atoms = [_parse_atom_index(pdb_line[6:11])]
                for pos in (11,16,21,26):
                    try:
-                        atoms.append(int(pdb_line[pos:pos+5]))
+                        atoms.append(_parse_atom_index(pdb_line[pos:pos+5]))
                    except:
                        pass
                self._current_model.connects.append(atoms)
@@ -208,11 +208,9 @@ class PdbStructure(object):
        self._finalize()

    def _reset_atom_numbers(self):
-        self._atom_numbers_are_hex = False
        self._next_atom_number = 1

    def _reset_residue_numbers(self):
-        self._residue_numbers_are_hex = False
        self._next_residue_number = 1

    def write(self, output_stream=sys.stdout):
@@ -726,18 +724,11 @@ class Atom(object):
        self.is_final_residue_in_chain = False
        # Start parsing fields from pdb line
        self.record_name = pdb_line[0:6].strip()
-        if pdbstructure is not None and pdbstructure._atom_numbers_are_hex:
-            self.serial_number = int(pdb_line[6:11], 16)
-        else:
-            try:
-                self.serial_number = int(pdb_line[6:11])
-            except:
-                try:
-                    self.serial_number = int(pdb_line[6:11], 16)
-                    pdbstructure._atom_numbers_are_hex = True
-                except:
-                    # Just give it the next number in sequence.
-                    self.serial_number = pdbstructure._next_atom_number
+        try:
+            self.serial_number = _parse_atom_index(pdb_line[6:11])
+        except:
+            # Just give it the next number in sequence.
+            self.serial_number = pdbstructure._next_atom_number
        self.name_with_spaces = pdb_line[12:16]
        alternate_location_indicator = pdb_line[16]

@@ -753,31 +744,27 @@ class Atom(object):
        self.residue_name = self.residue_name_with_spaces.strip()

        self.chain_id = pdb_line[21]
-        if pdbstructure is not None and pdbstructure._residue_numbers_are_hex:
-            self.residue_number = int(pdb_line[22:26], 16)
-        else:
+        try:
+            self.residue_number = int(pdb_line[22:26])
+        except:
            try:
-                self.residue_number = int(pdb_line[22:26])
+                self.residue_number = int(pdb_line[22:26], 16) - 0xA000 + 10000
            except:
-                try:
-                    self.residue_number = int(pdb_line[22:26], 16)
-                    pdbstructure._residue_numbers_are_hex = True
-                except:
-                    # When VMD runs out of hex values it starts filling the residue ID field with ****.
-                    # Look at the most recent atoms to figure out whether this is a new residue or not.
-                    if pdbstructure._current_model is None or pdbstructure._current_model._current_chain is None or pdbstructure._current_model._current_chain._current_residue is None:
-                        # This is the first residue in the model.
+                # When VMD runs out of hex values it starts filling the residue ID field with ****.
+                # Look at the most recent atoms to figure out whether this is a new residue or not.
+                if pdbstructure._current_model is None or pdbstructure._current_model._current_chain is None or pdbstructure._current_model._current_chain._current_residue is None:
+                    # This is the first residue in the model.
+                    self.residue_number = pdbstructure._next_residue_number
+                else:
+                    currentRes = pdbstructure._current_model._current_chain._current_residue
+                    if currentRes.name_with_spaces != self.residue_name_with_spaces:
+                        # The residue name has changed.
+                        self.residue_number = pdbstructure._next_residue_number
+                    elif self.name_with_spaces in currentRes.atoms_by_name:
+                        # There is already an atom with this name.
                        self.residue_number = pdbstructure._next_residue_number
                    else:
-                        currentRes = pdbstructure._current_model._current_chain._current_residue
-                        if currentRes.name_with_spaces != self.residue_name_with_spaces:
-                            # The residue name has changed.
-                            self.residue_number = pdbstructure._next_residue_number
-                        elif self.name_with_spaces in currentRes.atoms_by_name:
-                            # There is already an atom with this name.
-                            self.residue_number = pdbstructure._next_residue_number
-                        else:
-                            self.residue_number = currentRes.number
+                        self.residue_number = currentRes.number
        self.insertion_code = pdb_line[26]
        # coordinates, occupancy, and temperature factor belong in Atom.Location object
        x = float(pdb_line[30:38])
@@ -986,6 +973,13 @@ class Atom(object):
            return str(self.position)


+def _parse_atom_index(index):
+    """Parse the string containing an atom index, which might be either decimal or hex."""
+    try:
+        return int(index)
+    except:
+        return int(index, 16) - 0xA0000 + 100000
+
 # run module directly for testing
 if __name__=='__main__':
    # Test the examples in the docstrings

--- a/wrappers/python/openmm/app/pdbfile.py
+++ b/wrappers/python/openmm/app/pdbfile.py
@@ -363,7 +363,7 @@ class PDBFile(object):
                if keepIds and len(res.id) < 5:
                    resId = res.id
                else:
-                    resId = "%4d" % ((resIndex+1)%10000)
+                    resId = _formatIndex(resIndex+1, 4)
                if len(res.insertionCode) == 1:
                    resIC = res.insertionCode
                else:
@@ -384,8 +384,8 @@ class PDBFile(object):
                    else:
                        atomName = atom.name
                    coords = positions[posIndex]
-                    line = "%s%5d %-4s %3s %s%4s%1s   %s%s%s  1.00  0.00          %2s  " % (
-                        recordName, atomIndex%100000, atomName, resName, chainName, resId, resIC, _format_83(coords[0]),
+                    line = "%s%5s %-4s %3s %s%4s%1s   %s%s%s  1.00  0.00          %2s  " % (
+                        recordName, _formatIndex(atomIndex, 5), atomName, resName, chainName, resId, resIC, _format_83(coords[0]),
                        _format_83(coords[1]), _format_83(coords[2]), symbol)
                    if len(line) != 80:
                        raise ValueError('Fixed width overflow detected')
@@ -393,7 +393,7 @@ class PDBFile(object):
                    posIndex += 1
                    atomIndex += 1
                if resIndex == len(residues)-1:
-                    print("TER   %5d      %3s %s%4s" % (atomIndex, resName, chainName, resId), file=file)
+                    print("TER   %5s      %3s %s%4s" % (_formatIndex(atomIndex, 5), resName, chainName, resId), file=file)
                    atomIndex += 1
        if modelIndex is not None:
            print("ENDMDL", file=file)
@@ -450,11 +450,11 @@ class PDBFile(object):
            for index1 in sorted(atomBonds):
                bonded = atomBonds[index1]
                while len(bonded) > 4:
-                    print("CONECT%5d%5d%5d%5d" % (index1, bonded[0], bonded[1], bonded[2]), file=file)
+                    print("CONECT%5s%5s%5s%5s" % (_formatIndex(index1, 5), _formatIndex(bonded[0], 5), _formatIndex(bonded[1], 5), _formatIndex(bonded[2], 5)), file=file)
                    del bonded[:4]
-                line = "CONECT%5d" % index1
+                line = "CONECT%5s" % _formatIndex(index1, 5)
                for index2 in bonded:
-                    line = "%s%5d" % (line, index2)
+                    line = "%s%5s" % (line, _formatIndex(index2, 5))
                print(line, file=file)
        print("END", file=file)

@@ -470,3 +470,14 @@ def _format_83(f):
        return ('%8.3f' % f)[:8]
    raise ValueError('coordinate "%s" could not be represented '
                     'in a width-8 field' % f)
+
+def _formatIndex(index, places):
+    """Create a string representation of an atom or residue index.  If the value is larger than can fit
+    in the available space, switch to hex.
+    """
+    if index < 10**places:
+        format = f'%{places}d'
+        return format % index
+    format = f'%{places}X'
+    shiftedIndex = (index - 10**places + 10*16**(places-1)) % (16**places)
+    return format % shiftedIndex
\ No newline at end of file
--- a/wrappers/python/tests/TestPdbFile.py
+++ b/wrappers/python/tests/TestPdbFile.py
@@ -94,6 +94,45 @@ class TestPdbFile(unittest.TestCase):
            self.assertEqual(19, len(pdb.positions))
            self.assertEqual('ILE', list(pdb.topology.residues())[0].name)

+    def test_LargeFile(self):
+        """Write and read a file with more than 100,000 atoms"""
+        topology = Topology()
+        chain = topology.addChain('A')
+        for i in range(20000):
+            res = topology.addResidue('MOL', chain)
+            atoms = []
+            for j in range(6):
+                atoms.append(topology.addAtom(f'AT{j}', elem.carbon, res))
+            for j in range(5):
+                topology.addBond(atoms[j], atoms[j+1])
+        positions = [Vec3(0, 0, 0)]*topology.getNumAtoms()
+
+        # The model has 20,000 residues and 120,000 atoms.
+
+        output = StringIO()
+        PDBFile.writeFile(topology, positions, output)
+        input = StringIO(output.getvalue())
+        pdb = PDBFile(input)
+        output.close()
+        input.close()
+        self.assertEqual(len(positions), len(pdb.positions))
+        self.assertEqual(topology.getNumAtoms(), pdb.topology.getNumAtoms())
+        self.assertEqual(topology.getNumResidues(), pdb.topology.getNumResidues())
+        self.assertEqual(topology.getNumChains(), pdb.topology.getNumChains())
+        self.assertEqual(topology.getNumBonds(), pdb.topology.getNumBonds())
+        for atom in pdb.topology.atoms():
+            self.assertEqual(str(atom.index+1), atom.id)
+        for res in pdb.topology.residues():
+            self.assertEqual(str(res.index+1), res.id)
+
+        # Make sure the CONECT records were interpreted correctly.
+
+        bonds = set()
+        for atom1, atom2 in topology.bonds():
+            bonds.add(tuple(sorted((atom1.index, atom2.index))))
+        for atom1, atom2 in pdb.topology.bonds():
+            assert tuple(sorted((atom1.index, atom2.index))) in bonds
+
    def assertVecAlmostEqual(self, p1, p2, tol=1e-7):
        unit = p1.unit
        p1 = p1.value_in_unit(unit)