Unverified Commit 79740ece authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Use hex for large atom and residue IDs in PDB files (#3522)

parent 7c42ef5e
......@@ -164,10 +164,10 @@ class PdbStructure(object):
if command == "ATOM " or command == "HETATM":
self._add_atom(Atom(pdb_line, self, self.extraParticleIdentifier))
elif command == "CONECT":
atoms = [int(pdb_line[6:11])]
atoms = [_parse_atom_index(pdb_line[6:11])]
for pos in (11,16,21,26):
try:
atoms.append(int(pdb_line[pos:pos+5]))
atoms.append(_parse_atom_index(pdb_line[pos:pos+5]))
except:
pass
self._current_model.connects.append(atoms)
......@@ -208,11 +208,9 @@ class PdbStructure(object):
self._finalize()
def _reset_atom_numbers(self):
self._atom_numbers_are_hex = False
self._next_atom_number = 1
def _reset_residue_numbers(self):
self._residue_numbers_are_hex = False
self._next_residue_number = 1
def write(self, output_stream=sys.stdout):
......@@ -726,18 +724,11 @@ class Atom(object):
self.is_final_residue_in_chain = False
# Start parsing fields from pdb line
self.record_name = pdb_line[0:6].strip()
if pdbstructure is not None and pdbstructure._atom_numbers_are_hex:
self.serial_number = int(pdb_line[6:11], 16)
else:
try:
self.serial_number = int(pdb_line[6:11])
except:
try:
self.serial_number = int(pdb_line[6:11], 16)
pdbstructure._atom_numbers_are_hex = True
except:
# Just give it the next number in sequence.
self.serial_number = pdbstructure._next_atom_number
try:
self.serial_number = _parse_atom_index(pdb_line[6:11])
except:
# Just give it the next number in sequence.
self.serial_number = pdbstructure._next_atom_number
self.name_with_spaces = pdb_line[12:16]
alternate_location_indicator = pdb_line[16]
......@@ -753,31 +744,27 @@ class Atom(object):
self.residue_name = self.residue_name_with_spaces.strip()
self.chain_id = pdb_line[21]
if pdbstructure is not None and pdbstructure._residue_numbers_are_hex:
self.residue_number = int(pdb_line[22:26], 16)
else:
try:
self.residue_number = int(pdb_line[22:26])
except:
try:
self.residue_number = int(pdb_line[22:26])
self.residue_number = int(pdb_line[22:26], 16) - 0xA000 + 10000
except:
try:
self.residue_number = int(pdb_line[22:26], 16)
pdbstructure._residue_numbers_are_hex = True
except:
# When VMD runs out of hex values it starts filling the residue ID field with ****.
# Look at the most recent atoms to figure out whether this is a new residue or not.
if pdbstructure._current_model is None or pdbstructure._current_model._current_chain is None or pdbstructure._current_model._current_chain._current_residue is None:
# This is the first residue in the model.
# When VMD runs out of hex values it starts filling the residue ID field with ****.
# Look at the most recent atoms to figure out whether this is a new residue or not.
if pdbstructure._current_model is None or pdbstructure._current_model._current_chain is None or pdbstructure._current_model._current_chain._current_residue is None:
# This is the first residue in the model.
self.residue_number = pdbstructure._next_residue_number
else:
currentRes = pdbstructure._current_model._current_chain._current_residue
if currentRes.name_with_spaces != self.residue_name_with_spaces:
# The residue name has changed.
self.residue_number = pdbstructure._next_residue_number
elif self.name_with_spaces in currentRes.atoms_by_name:
# There is already an atom with this name.
self.residue_number = pdbstructure._next_residue_number
else:
currentRes = pdbstructure._current_model._current_chain._current_residue
if currentRes.name_with_spaces != self.residue_name_with_spaces:
# The residue name has changed.
self.residue_number = pdbstructure._next_residue_number
elif self.name_with_spaces in currentRes.atoms_by_name:
# There is already an atom with this name.
self.residue_number = pdbstructure._next_residue_number
else:
self.residue_number = currentRes.number
self.residue_number = currentRes.number
self.insertion_code = pdb_line[26]
# coordinates, occupancy, and temperature factor belong in Atom.Location object
x = float(pdb_line[30:38])
......@@ -986,6 +973,13 @@ class Atom(object):
return str(self.position)
def _parse_atom_index(index):
"""Parse the string containing an atom index, which might be either decimal or hex."""
try:
return int(index)
except:
return int(index, 16) - 0xA0000 + 100000
# run module directly for testing
if __name__=='__main__':
# Test the examples in the docstrings
......
......@@ -363,7 +363,7 @@ class PDBFile(object):
if keepIds and len(res.id) < 5:
resId = res.id
else:
resId = "%4d" % ((resIndex+1)%10000)
resId = _formatIndex(resIndex+1, 4)
if len(res.insertionCode) == 1:
resIC = res.insertionCode
else:
......@@ -384,8 +384,8 @@ class PDBFile(object):
else:
atomName = atom.name
coords = positions[posIndex]
line = "%s%5d %-4s %3s %s%4s%1s %s%s%s 1.00 0.00 %2s " % (
recordName, atomIndex%100000, atomName, resName, chainName, resId, resIC, _format_83(coords[0]),
line = "%s%5s %-4s %3s %s%4s%1s %s%s%s 1.00 0.00 %2s " % (
recordName, _formatIndex(atomIndex, 5), atomName, resName, chainName, resId, resIC, _format_83(coords[0]),
_format_83(coords[1]), _format_83(coords[2]), symbol)
if len(line) != 80:
raise ValueError('Fixed width overflow detected')
......@@ -393,7 +393,7 @@ class PDBFile(object):
posIndex += 1
atomIndex += 1
if resIndex == len(residues)-1:
print("TER %5d %3s %s%4s" % (atomIndex, resName, chainName, resId), file=file)
print("TER %5s %3s %s%4s" % (_formatIndex(atomIndex, 5), resName, chainName, resId), file=file)
atomIndex += 1
if modelIndex is not None:
print("ENDMDL", file=file)
......@@ -450,11 +450,11 @@ class PDBFile(object):
for index1 in sorted(atomBonds):
bonded = atomBonds[index1]
while len(bonded) > 4:
print("CONECT%5d%5d%5d%5d" % (index1, bonded[0], bonded[1], bonded[2]), file=file)
print("CONECT%5s%5s%5s%5s" % (_formatIndex(index1, 5), _formatIndex(bonded[0], 5), _formatIndex(bonded[1], 5), _formatIndex(bonded[2], 5)), file=file)
del bonded[:4]
line = "CONECT%5d" % index1
line = "CONECT%5s" % _formatIndex(index1, 5)
for index2 in bonded:
line = "%s%5d" % (line, index2)
line = "%s%5s" % (line, _formatIndex(index2, 5))
print(line, file=file)
print("END", file=file)
......@@ -470,3 +470,14 @@ def _format_83(f):
return ('%8.3f' % f)[:8]
raise ValueError('coordinate "%s" could not be represented '
'in a width-8 field' % f)
def _formatIndex(index, places):
"""Create a string representation of an atom or residue index. If the value is larger than can fit
in the available space, switch to hex.
"""
if index < 10**places:
format = f'%{places}d'
return format % index
format = f'%{places}X'
shiftedIndex = (index - 10**places + 10*16**(places-1)) % (16**places)
return format % shiftedIndex
\ No newline at end of file
......@@ -94,6 +94,45 @@ class TestPdbFile(unittest.TestCase):
self.assertEqual(19, len(pdb.positions))
self.assertEqual('ILE', list(pdb.topology.residues())[0].name)
def test_LargeFile(self):
"""Write and read a file with more than 100,000 atoms"""
topology = Topology()
chain = topology.addChain('A')
for i in range(20000):
res = topology.addResidue('MOL', chain)
atoms = []
for j in range(6):
atoms.append(topology.addAtom(f'AT{j}', elem.carbon, res))
for j in range(5):
topology.addBond(atoms[j], atoms[j+1])
positions = [Vec3(0, 0, 0)]*topology.getNumAtoms()
# The model has 20,000 residues and 120,000 atoms.
output = StringIO()
PDBFile.writeFile(topology, positions, output)
input = StringIO(output.getvalue())
pdb = PDBFile(input)
output.close()
input.close()
self.assertEqual(len(positions), len(pdb.positions))
self.assertEqual(topology.getNumAtoms(), pdb.topology.getNumAtoms())
self.assertEqual(topology.getNumResidues(), pdb.topology.getNumResidues())
self.assertEqual(topology.getNumChains(), pdb.topology.getNumChains())
self.assertEqual(topology.getNumBonds(), pdb.topology.getNumBonds())
for atom in pdb.topology.atoms():
self.assertEqual(str(atom.index+1), atom.id)
for res in pdb.topology.residues():
self.assertEqual(str(res.index+1), res.id)
# Make sure the CONECT records were interpreted correctly.
bonds = set()
for atom1, atom2 in topology.bonds():
bonds.add(tuple(sorted((atom1.index, atom2.index))))
for atom1, atom2 in pdb.topology.bonds():
assert tuple(sorted((atom1.index, atom2.index))) in bonds
def assertVecAlmostEqual(self, p1, p2, tol=1e-7):
unit = p1.unit
p1 = p1.value_in_unit(unit)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment