Unverified Commit 79740ece authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Use hex for large atom and residue IDs in PDB files (#3522)

parent 7c42ef5e
...@@ -164,10 +164,10 @@ class PdbStructure(object): ...@@ -164,10 +164,10 @@ class PdbStructure(object):
if command == "ATOM " or command == "HETATM": if command == "ATOM " or command == "HETATM":
self._add_atom(Atom(pdb_line, self, self.extraParticleIdentifier)) self._add_atom(Atom(pdb_line, self, self.extraParticleIdentifier))
elif command == "CONECT": elif command == "CONECT":
atoms = [int(pdb_line[6:11])] atoms = [_parse_atom_index(pdb_line[6:11])]
for pos in (11,16,21,26): for pos in (11,16,21,26):
try: try:
atoms.append(int(pdb_line[pos:pos+5])) atoms.append(_parse_atom_index(pdb_line[pos:pos+5]))
except: except:
pass pass
self._current_model.connects.append(atoms) self._current_model.connects.append(atoms)
...@@ -208,11 +208,9 @@ class PdbStructure(object): ...@@ -208,11 +208,9 @@ class PdbStructure(object):
self._finalize() self._finalize()
def _reset_atom_numbers(self): def _reset_atom_numbers(self):
self._atom_numbers_are_hex = False
self._next_atom_number = 1 self._next_atom_number = 1
def _reset_residue_numbers(self): def _reset_residue_numbers(self):
self._residue_numbers_are_hex = False
self._next_residue_number = 1 self._next_residue_number = 1
def write(self, output_stream=sys.stdout): def write(self, output_stream=sys.stdout):
...@@ -726,18 +724,11 @@ class Atom(object): ...@@ -726,18 +724,11 @@ class Atom(object):
self.is_final_residue_in_chain = False self.is_final_residue_in_chain = False
# Start parsing fields from pdb line # Start parsing fields from pdb line
self.record_name = pdb_line[0:6].strip() self.record_name = pdb_line[0:6].strip()
if pdbstructure is not None and pdbstructure._atom_numbers_are_hex: try:
self.serial_number = int(pdb_line[6:11], 16) self.serial_number = _parse_atom_index(pdb_line[6:11])
else: except:
try: # Just give it the next number in sequence.
self.serial_number = int(pdb_line[6:11]) self.serial_number = pdbstructure._next_atom_number
except:
try:
self.serial_number = int(pdb_line[6:11], 16)
pdbstructure._atom_numbers_are_hex = True
except:
# Just give it the next number in sequence.
self.serial_number = pdbstructure._next_atom_number
self.name_with_spaces = pdb_line[12:16] self.name_with_spaces = pdb_line[12:16]
alternate_location_indicator = pdb_line[16] alternate_location_indicator = pdb_line[16]
...@@ -753,31 +744,27 @@ class Atom(object): ...@@ -753,31 +744,27 @@ class Atom(object):
self.residue_name = self.residue_name_with_spaces.strip() self.residue_name = self.residue_name_with_spaces.strip()
self.chain_id = pdb_line[21] self.chain_id = pdb_line[21]
if pdbstructure is not None and pdbstructure._residue_numbers_are_hex: try:
self.residue_number = int(pdb_line[22:26], 16) self.residue_number = int(pdb_line[22:26])
else: except:
try: try:
self.residue_number = int(pdb_line[22:26]) self.residue_number = int(pdb_line[22:26], 16) - 0xA000 + 10000
except: except:
try: # When VMD runs out of hex values it starts filling the residue ID field with ****.
self.residue_number = int(pdb_line[22:26], 16) # Look at the most recent atoms to figure out whether this is a new residue or not.
pdbstructure._residue_numbers_are_hex = True if pdbstructure._current_model is None or pdbstructure._current_model._current_chain is None or pdbstructure._current_model._current_chain._current_residue is None:
except: # This is the first residue in the model.
# When VMD runs out of hex values it starts filling the residue ID field with ****. self.residue_number = pdbstructure._next_residue_number
# Look at the most recent atoms to figure out whether this is a new residue or not. else:
if pdbstructure._current_model is None or pdbstructure._current_model._current_chain is None or pdbstructure._current_model._current_chain._current_residue is None: currentRes = pdbstructure._current_model._current_chain._current_residue
# This is the first residue in the model. if currentRes.name_with_spaces != self.residue_name_with_spaces:
# The residue name has changed.
self.residue_number = pdbstructure._next_residue_number
elif self.name_with_spaces in currentRes.atoms_by_name:
# There is already an atom with this name.
self.residue_number = pdbstructure._next_residue_number self.residue_number = pdbstructure._next_residue_number
else: else:
currentRes = pdbstructure._current_model._current_chain._current_residue self.residue_number = currentRes.number
if currentRes.name_with_spaces != self.residue_name_with_spaces:
# The residue name has changed.
self.residue_number = pdbstructure._next_residue_number
elif self.name_with_spaces in currentRes.atoms_by_name:
# There is already an atom with this name.
self.residue_number = pdbstructure._next_residue_number
else:
self.residue_number = currentRes.number
self.insertion_code = pdb_line[26] self.insertion_code = pdb_line[26]
# coordinates, occupancy, and temperature factor belong in Atom.Location object # coordinates, occupancy, and temperature factor belong in Atom.Location object
x = float(pdb_line[30:38]) x = float(pdb_line[30:38])
...@@ -986,6 +973,13 @@ class Atom(object): ...@@ -986,6 +973,13 @@ class Atom(object):
return str(self.position) return str(self.position)
def _parse_atom_index(index):
"""Parse the string containing an atom index, which might be either decimal or hex."""
try:
return int(index)
except:
return int(index, 16) - 0xA0000 + 100000
# run module directly for testing # run module directly for testing
if __name__=='__main__': if __name__=='__main__':
# Test the examples in the docstrings # Test the examples in the docstrings
......
...@@ -363,7 +363,7 @@ class PDBFile(object): ...@@ -363,7 +363,7 @@ class PDBFile(object):
if keepIds and len(res.id) < 5: if keepIds and len(res.id) < 5:
resId = res.id resId = res.id
else: else:
resId = "%4d" % ((resIndex+1)%10000) resId = _formatIndex(resIndex+1, 4)
if len(res.insertionCode) == 1: if len(res.insertionCode) == 1:
resIC = res.insertionCode resIC = res.insertionCode
else: else:
...@@ -384,8 +384,8 @@ class PDBFile(object): ...@@ -384,8 +384,8 @@ class PDBFile(object):
else: else:
atomName = atom.name atomName = atom.name
coords = positions[posIndex] coords = positions[posIndex]
line = "%s%5d %-4s %3s %s%4s%1s %s%s%s 1.00 0.00 %2s " % ( line = "%s%5s %-4s %3s %s%4s%1s %s%s%s 1.00 0.00 %2s " % (
recordName, atomIndex%100000, atomName, resName, chainName, resId, resIC, _format_83(coords[0]), recordName, _formatIndex(atomIndex, 5), atomName, resName, chainName, resId, resIC, _format_83(coords[0]),
_format_83(coords[1]), _format_83(coords[2]), symbol) _format_83(coords[1]), _format_83(coords[2]), symbol)
if len(line) != 80: if len(line) != 80:
raise ValueError('Fixed width overflow detected') raise ValueError('Fixed width overflow detected')
...@@ -393,7 +393,7 @@ class PDBFile(object): ...@@ -393,7 +393,7 @@ class PDBFile(object):
posIndex += 1 posIndex += 1
atomIndex += 1 atomIndex += 1
if resIndex == len(residues)-1: if resIndex == len(residues)-1:
print("TER %5d %3s %s%4s" % (atomIndex, resName, chainName, resId), file=file) print("TER %5s %3s %s%4s" % (_formatIndex(atomIndex, 5), resName, chainName, resId), file=file)
atomIndex += 1 atomIndex += 1
if modelIndex is not None: if modelIndex is not None:
print("ENDMDL", file=file) print("ENDMDL", file=file)
...@@ -450,11 +450,11 @@ class PDBFile(object): ...@@ -450,11 +450,11 @@ class PDBFile(object):
for index1 in sorted(atomBonds): for index1 in sorted(atomBonds):
bonded = atomBonds[index1] bonded = atomBonds[index1]
while len(bonded) > 4: while len(bonded) > 4:
print("CONECT%5d%5d%5d%5d" % (index1, bonded[0], bonded[1], bonded[2]), file=file) print("CONECT%5s%5s%5s%5s" % (_formatIndex(index1, 5), _formatIndex(bonded[0], 5), _formatIndex(bonded[1], 5), _formatIndex(bonded[2], 5)), file=file)
del bonded[:4] del bonded[:4]
line = "CONECT%5d" % index1 line = "CONECT%5s" % _formatIndex(index1, 5)
for index2 in bonded: for index2 in bonded:
line = "%s%5d" % (line, index2) line = "%s%5s" % (line, _formatIndex(index2, 5))
print(line, file=file) print(line, file=file)
print("END", file=file) print("END", file=file)
...@@ -470,3 +470,14 @@ def _format_83(f): ...@@ -470,3 +470,14 @@ def _format_83(f):
return ('%8.3f' % f)[:8] return ('%8.3f' % f)[:8]
raise ValueError('coordinate "%s" could not be represented ' raise ValueError('coordinate "%s" could not be represented '
'in a width-8 field' % f) 'in a width-8 field' % f)
def _formatIndex(index, places):
"""Create a string representation of an atom or residue index. If the value is larger than can fit
in the available space, switch to hex.
"""
if index < 10**places:
format = f'%{places}d'
return format % index
format = f'%{places}X'
shiftedIndex = (index - 10**places + 10*16**(places-1)) % (16**places)
return format % shiftedIndex
\ No newline at end of file
...@@ -94,6 +94,45 @@ class TestPdbFile(unittest.TestCase): ...@@ -94,6 +94,45 @@ class TestPdbFile(unittest.TestCase):
self.assertEqual(19, len(pdb.positions)) self.assertEqual(19, len(pdb.positions))
self.assertEqual('ILE', list(pdb.topology.residues())[0].name) self.assertEqual('ILE', list(pdb.topology.residues())[0].name)
def test_LargeFile(self):
"""Write and read a file with more than 100,000 atoms"""
topology = Topology()
chain = topology.addChain('A')
for i in range(20000):
res = topology.addResidue('MOL', chain)
atoms = []
for j in range(6):
atoms.append(topology.addAtom(f'AT{j}', elem.carbon, res))
for j in range(5):
topology.addBond(atoms[j], atoms[j+1])
positions = [Vec3(0, 0, 0)]*topology.getNumAtoms()
# The model has 20,000 residues and 120,000 atoms.
output = StringIO()
PDBFile.writeFile(topology, positions, output)
input = StringIO(output.getvalue())
pdb = PDBFile(input)
output.close()
input.close()
self.assertEqual(len(positions), len(pdb.positions))
self.assertEqual(topology.getNumAtoms(), pdb.topology.getNumAtoms())
self.assertEqual(topology.getNumResidues(), pdb.topology.getNumResidues())
self.assertEqual(topology.getNumChains(), pdb.topology.getNumChains())
self.assertEqual(topology.getNumBonds(), pdb.topology.getNumBonds())
for atom in pdb.topology.atoms():
self.assertEqual(str(atom.index+1), atom.id)
for res in pdb.topology.residues():
self.assertEqual(str(res.index+1), res.id)
# Make sure the CONECT records were interpreted correctly.
bonds = set()
for atom1, atom2 in topology.bonds():
bonds.add(tuple(sorted((atom1.index, atom2.index))))
for atom1, atom2 in pdb.topology.bonds():
assert tuple(sorted((atom1.index, atom2.index))) in bonds
def assertVecAlmostEqual(self, p1, p2, tol=1e-7): def assertVecAlmostEqual(self, p1, p2, tol=1e-7):
unit = p1.unit unit = p1.unit
p1 = p1.value_in_unit(unit) p1 = p1.value_in_unit(unit)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment