"vscode:/vscode.git/clone" did not exist on "fdce64f702d78307c149c79958f3e1578f8fed2d"
Unverified Commit 51adf8c2 authored by Evan Pretti's avatar Evan Pretti Committed by GitHub
Browse files

Merge pull request #5020 from epretti/template-match-errors

Output more detailed messages when template matching fails
parents bb3073d4 1cf42774
......@@ -40,7 +40,8 @@ import math
import warnings
from math import sqrt, cos
from copy import deepcopy
from collections import defaultdict
from collections import Counter, defaultdict
from difflib import SequenceMatcher
import openmm as mm
import openmm.unit as unit
from . import element as elem
......@@ -1794,62 +1795,226 @@ def _applyMultiResiduePatch(data, clusters, patch, candidateTemplates, selectedT
def _findMatchErrors(forcefield, res):
"""Try to guess why a residue failed to match any template and return an error message."""
residueCounts = _countResidueAtoms([atom.element for atom in res.atoms()])
numResidueAtoms = sum(residueCounts.values())
numResidueHeavyAtoms = sum(residueCounts[element] for element in residueCounts if element not in (None, elem.hydrogen))
# Loop over templates and see how closely each one might match.
def counterSubtract(counter1, counter2):
"""Subtracts two Counter objects (equivalent to counter1 - counter2 but preserves negatives)."""
difference = counter1.copy()
difference.subtract(counter2)
return difference
def elemToNum(elem):
"""Returns the atomic number of an element or 0 for None."""
return 0 if elem is None else elem.atomic_number
def makeIntBondSpec(bond):
"""Converts an internal bond to (x, y) where x and y are atomic numbers of the bonded atoms' elements, and x < y."""
elemNum1 = elemToNum(bond[0].element)
elemNum2 = elemToNum(bond[1].element)
return min(elemNum1, elemNum2), max(elemNum1, elemNum2)
def makeExtBondSpec(bond):
"""Converts an external bond to the atomic number of the element of the non-external atom."""
return elemToNum(bond.atom1.element) if bond.atom1.residue is res else elemToNum(bond.atom2.element)
def bestMatchAtom(templatesDiffs, templateNames, useHeavy):
"""Finds the best matching templates based on atoms, optionally ignoring non-heavy atoms."""
bestMatches = []
bestScore = None
for templateName in templateNames:
# Find the (signed) differences in atom counts for all elements,
# optionally considering only heavy atoms.
allDiffs = templatesDiffs[templateName].items()
diffs = [(elemNum, diff) for elemNum, diff in allDiffs if not useHeavy or elemNum > 1]
# Build a score as (x, y), where y is the sum of the magnitudes of
# the differences in element counts, and x is a flag set if the
# residue has more of any element than the template. Minimizing the
# score will, when these tuples are compared, favor templates where
# the residue is missing atoms first, and only if there are none,
# include templates for which the residue has extra atoms.
score = (any(diff > 0 for _, diff in allDiffs), sum(abs(diff) for _, diff in diffs))
if bestScore is None or score <= bestScore:
# Keep a list of every template with the lowest score seen.
if score != bestScore:
# If bestScore is None or score < bestScore, clear the list.
bestMatches.clear()
bestScore = score
bestMatches.append(templateName)
return bestMatches, bestScore
def bestMatchBond(templatesDiffs, templateNames):
"""Finds the best matching templates based on bonds."""
bestMatches = []
bestScore = None
for templateName in templateNames:
diffs = templatesDiffs[templateName].items()
# Favor templates where the residue is missing bonds first, and if
# there are none, include templates where it has extra bonds.
score = (any(diff > 0 for _, diff in diffs), sum(abs(diff) for _, diff in diffs))
if bestScore is None or score <= bestScore:
if score != bestScore:
bestMatches.clear()
bestScore = score
bestMatches.append(templateName)
return bestMatches, bestScore
def joinMessages(messages):
"""Produce a human-readable message from the individual message strings passed."""
messages = list(messages)
if len(messages) < 3:
return ' and '.join(messages)
messages[-1] = f'and {messages[-1]}'
return ', '.join(messages)
def formatDiffMessage(diffs, formatter):
"""Formats a message describing a difference between a residue and a template."""
missing, extra = [], []
for key, diff in diffs.items():
if diff < 0:
missing.append((key, -diff))
if diff > 0:
extra.append((key, diff))
messages = []
if missing:
message = joinMessages(f'{diff} {formatter(key, diff)}' for key, diff in sorted(missing))
messages.append(f'is missing {message}')
if extra:
message = joinMessages(f'{diff} {formatter(key, diff)}' for key, diff in sorted(extra))
messages.append(f'has {message} too many')
return joinMessages(messages)
def formatAtomDiff(key, diff):
"""Formats a string describing an element associated with a different number of atoms."""
return (f'{elem.Element.getByAtomicNumber(key).symbol} atom' if key else 'extra site') + ('' if diff == 1 else 's')
def formatBondDiff(key, diff):
"""Formats a string describing elements associated with a different number of bonds."""
name1 = elem.Element.getByAtomicNumber(key[0]).symbol if key else 'extra site'
name2 = elem.Element.getByAtomicNumber(key[1]).symbol if key else 'extra site'
return f'{name1}-{name2} bond' + ('' if diff == 1 else 's')
def pickBestMatch(bestMatches):
"""If there are multiple best-scoring matches, pick one with the closest name to the residue. Call only with a non-empty list."""
if not res.name:
return bestMatches[0]
return max(bestMatches, key=lambda match: SequenceMatcher(a=res.name.strip(), b=match.strip(), autojunk=False).ratio())
# First check to see if there are no templates in the force field.
if not forcefield._templates:
return f'The force field contains no residue templates.'
# Get all elements in all templates in the force field and see if this
# residue uses an element not supported. Otherwise, prepare fingerprints of
# the residue and templates based on counts of the elements of their atoms.
supportedElements = set(elemToNum(atom.element) for template in forcefield._templates.values() for atom in template.atoms)
residueAtomCounts = Counter(elemToNum(atom.element) for atom in res.atoms())
unsupportedElements = set(residueAtomCounts.keys()) - supportedElements
if unsupportedElements:
unsupportedMessage = joinMessages(formatAtomDiff(elemNum, 0) for elemNum in sorted(unsupportedElements))
return f'The residue contains {unsupportedMessage}, which are not supported by any template in the force field.'
# It will be useful to provide a special message if this is a terminal
# residue of the chain, since this is a common cause of topology problems.
bestMatchName = None
numBestMatchAtoms = 3*numResidueAtoms
numBestMatchHeavyAtoms = 2*numResidueHeavyAtoms
for templateName in forcefield._templates:
chainResidues = list(res.chain.residues())
if len(chainResidues) > 1 and (res == chainResidues[0] or res == chainResidues[-1]):
terminalMessage = ' Is the chain terminated in a way that is unsupported by the force field?'
else:
terminalMessage = ''
def makeTemplateAtomDiff(templateName):
"""Prepares a Counter describing the difference between a residue's and a template's atoms."""
template = forcefield._templates[templateName]
templateCounts = _countResidueAtoms([atom.element for atom in template.atoms])
return counterSubtract(residueAtomCounts, Counter(elemToNum(atom.element) for atom in template.atoms))
templatesAtomDiffs = {templateName: makeTemplateAtomDiff(templateName) for templateName in forcefield._templates}
# Compare the residue with templates, first based on heavy atoms, then if
# templates are found where all heavy atoms match, on all atoms.
bestMatches = templatesAtomDiffs.keys()
bestMatches, bestScore = bestMatchAtom(templatesAtomDiffs, bestMatches, True)
if bestMatches and bestScore[1]:
# If there are matches found and the best score's sum is non-zero
# (an imperfect match), return. Otherwise, if the best matches are
# perfect, keep filtering with additional criteria.
bestMatch = pickBestMatch(bestMatches)
return f'The set of atoms is similar to {bestMatch}, but {formatDiffMessage(templatesAtomDiffs[bestMatch], formatAtomDiff)}.{terminalMessage}'
bestMatches, bestScore = bestMatchAtom(templatesAtomDiffs, bestMatches, False)
if bestMatches and bestScore[1]:
bestMatch = pickBestMatch(bestMatches)
bestMatchDiffs = templatesAtomDiffs[bestMatch]
# Give additional help in the special cases where the residue is missing
# sites or hydrogen atoms and nothing else.
if bestMatchDiffs[0] < 0 and all(diff == 0 for key, diff in bestMatchDiffs.items() if key != 0):
specialLabel = 'it' if bestMatchDiffs[0] == -1 else 'them'
specialMessage = f' You may be able to add {specialLabel} with Modeller.addExtraParticles().'
elif bestMatchDiffs[1] < 0 and all(diff == 0 for key, diff in bestMatchDiffs.items() if key != 1):
specialLabel = 'it' if bestMatchDiffs[1] == -1 else 'them'
specialMessage = f' You may be able to add {specialLabel} with Modeller.addHydrogens().'
else:
specialMessage = terminalMessage
return f'The set of heavy atoms matches {bestMatch}, but the residue {formatDiffMessage(bestMatchDiffs, formatAtomDiff)}.{specialMessage}'
# We found templates that are atom-for-atom matches to the residue, so now
# prepare fingerprints of the residue and templates based on their bonds.
# The compare the residue with templates based on bonds.
residueIntBondCounts = Counter(makeIntBondSpec(bond) for bond in res.internal_bonds())
def makeTemplateIntBondDiff(templateName):
"""Prepares a Counter describing the difference between a residue's and a template's bonds."""
template = forcefield._templates[templateName]
return counterSubtract(residueIntBondCounts, Counter(makeIntBondSpec((template.atoms[atom1], template.atoms[atom2])) for atom1, atom2 in template.bonds))
# Does the residue have any atoms that clearly aren't in the template?
# We only need to prepare data for the templates remaining in bestMatches.
if any(element not in templateCounts or templateCounts[element] < residueCounts[element] for element in residueCounts):
continue
templatesIntBondDiffs = {templateName: makeTemplateIntBondDiff(templateName) for templateName in bestMatches}
# If there are too many missing atoms, discard this template.
bestMatches, bestScore = bestMatchBond(templatesIntBondDiffs, bestMatches)
if bestMatches and bestScore[1]:
bestMatch = pickBestMatch(bestMatches)
if not tuple(res.internal_bonds()):
# Special message when the residue is missing all internal bonds.
return (f'The set of atoms matches {bestMatch}, but the residue has no bonds between its atoms. '
'If the topology was read from a PDB, it may contain non-standard residues/names and/or be missing CONECT records.')
return f'The set of atoms matches {bestMatch}, but the residue {formatDiffMessage(templatesIntBondDiffs[bestMatch], formatBondDiff)}.'
numTemplateAtoms = sum(templateCounts.values())
numTemplateHeavyAtoms = sum(templateCounts[element] for element in templateCounts if element not in (None, elem.hydrogen))
if numTemplateAtoms > numBestMatchAtoms:
continue
if numTemplateHeavyAtoms > numBestMatchHeavyAtoms:
continue
# Finally, check external bonds. Normally these should always be from heavy
# atoms, so don't do separate checks excluding vs. including non-heavy atoms
# (but the check will work regardless).
# If this template has the same number of missing atoms as our previous best one, look at the name
# to decide which one to use.
residueExtBondCounts = Counter(makeExtBondSpec(bond) for bond in res.external_bonds())
if numTemplateAtoms == numBestMatchAtoms:
if bestMatchName == res.name or res.name not in templateName:
continue
def makeTemplateExtBondDiff(templateName):
"""Prepares a Counter describing the difference between a residue's and a template's external bonds."""
template = forcefield._templates[templateName]
return counterSubtract(residueExtBondCounts, Counter(elemToNum(template.atoms[atom].element) for atom in template.externalBonds))
# Accept this as our new best match.
templatesExtBondDiffs = {templateName: makeTemplateExtBondDiff(templateName) for templateName in bestMatches}
bestMatchName = templateName
numBestMatchAtoms = numTemplateAtoms
numBestMatchHeavyAtoms = numTemplateHeavyAtoms
numBestMatchExtraParticles = len([atom for atom in template.atoms if atom.element is None])
bestMatches, bestScore = bestMatchAtom(templatesExtBondDiffs, bestMatches, False)
if bestMatches and bestScore[1]:
bestMatch = pickBestMatch(bestMatches)
bestMatchDiffs = templatesExtBondDiffs[bestMatch]
if all(value <= 0 for value in bestMatchDiffs.values()):
# Special message if external bonds are missing only, not different.
specialMessage = ' Is the chain missing a terminal capping group?'
else:
specialMessage = terminalMessage
return f'The atoms and bonds in the residue match {bestMatch}, but the set of externally bonded atoms {formatDiffMessage(bestMatchDiffs, formatAtomDiff)}.{specialMessage}'
# Return an appropriate error message.
# If we have matches at this point, atoms and bonds match perfectly, so the
# connectivity must be different. If bestMatches is empty, something else
# went wrong, so return a generic error message.
if bestMatches:
# Display all possible matching templates at this point to try to help,
# since we can't give any more detailed information.
return f'The atoms and bonds in the residue match {joinMessages(bestMatches)}, but the connectivity is different.'
if numBestMatchAtoms == numResidueAtoms:
chainResidues = list(res.chain.residues())
if len(chainResidues) > 1 and (res == chainResidues[0] or res == chainResidues[-1]):
return 'The set of atoms matches %s, but the bonds are different. Perhaps the chain is missing a terminal group?' % bestMatchName
return 'The set of atoms matches %s, but the bonds are different.' % bestMatchName
if bestMatchName is not None:
if numBestMatchHeavyAtoms == numResidueHeavyAtoms:
numResidueExtraParticles = len([atom for atom in res.atoms() if atom.element is None])
if numResidueExtraParticles == 0 and numBestMatchExtraParticles == 0:
return 'The set of atoms is similar to %s, but it is missing %d hydrogen atoms.' % (bestMatchName, numBestMatchAtoms-numResidueAtoms)
if numBestMatchExtraParticles-numResidueExtraParticles == numBestMatchAtoms-numResidueAtoms:
return 'The set of atoms is similar to %s, but it is missing %d extra particles. You can add them with Modeller.addExtraParticles().' % (bestMatchName, numBestMatchAtoms-numResidueAtoms)
return 'The set of atoms is similar to %s, but it is missing %d atoms.' % (bestMatchName, numBestMatchAtoms-numResidueAtoms)
return 'This might mean your input topology is missing some atoms or bonds, or possibly that you are using the wrong force field.'
def _createResidueTemplate(residue):
......
......@@ -831,6 +831,162 @@ class TestForceField(unittest.TestCase):
self.assertEqual(templates[1].name, 'ALA')
self.assertEqual(templates[2].name, 'CALA')
def test_matchErrorMessages(self):
"""Test match error detection and diagnostics"""
# Load a force field to test with and prepare some lines with which to build topologies from PDB files.
forcefield = ForceField('amber14-all.xml', 'amber14/opc.xml')
pdbLines = [
'ATOM 0 CH3 ACE A 1 0 0 0 C',
'ATOM 1 HH31 ACE A 1 0 0 0 H',
'ATOM 2 HH32 ACE A 1 0 0 0 H',
'ATOM 3 HH33 ACE A 1 0 0 0 H',
'ATOM 4 C ACE A 1 0 0 0 C',
'ATOM 5 O ACE A 1 0 0 0 O',
'ATOM 6 N GLY A 2 0 0 0 N',
'ATOM 7 H GLY A 2 0 0 0 H',
'ATOM 8 CA GLY A 2 0 0 0 C',
'ATOM 9 HA2 GLY A 2 0 0 0 H',
'ATOM 10 HA3 GLY A 2 0 0 0 H',
'ATOM 11 C GLY A 2 0 0 0 C',
'ATOM 12 O GLY A 2 0 0 0 O',
'ATOM 13 N NME A 3 0 0 0 N',
'ATOM 14 H NME A 3 0 0 0 H',
'ATOM 15 CH3 NME A 3 0 0 0 C',
'ATOM 16 HH31 NME A 3 0 0 0 H',
'ATOM 17 HH32 NME A 3 0 0 0 H',
'ATOM 18 HH33 NME A 3 0 0 0 H',
]
def makeSystem(lines):
return forcefield.createSystem(PDBFile(StringIO("\n".join(lines))).topology)
# This should succeed and not produce any match errors.
self.assertEqual(makeSystem(pdbLines).getNumParticles(), 19)
# Add an He atom and B atoms atom to GLY.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The residue contains He atoms and B atoms, which are not supported by any template in the force field'):
makeSystem(pdbLines[:9] + [
'ATOM 19 X1 GLY A 2 0 0 0 He',
'ATOM 20 X2 GLY A 2 0 0 0 B',
'ATOM 21 X3 GLY A 2 0 0 0 B',
] + pdbLines[9:])
# Delete CA atom from GLY.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The set of atoms is similar to GLY, but is missing 1 C atom'):
makeSystem(pdbLines[:8] + pdbLines[9:])
# Add an F atom to GLY.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The set of atoms is similar to GLY, but has 1 F atom too many'):
makeSystem(pdbLines[:9] + [
'ATOM 19 X1 GLY A 2 0 0 0 F',
] + pdbLines[9:])
# Delete CA atom from GLY and add an F atom.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The set of atoms is similar to GLY, but is missing 1 C atom and has 1 F atom too many'):
makeSystem(pdbLines[:8] + [
'ATOM 19 X1 GLY A 2 0 0 0 F',
] + pdbLines[9:])
# Add 1 F atom, 2 Cl atoms, and 1 Br atom to GLY.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The set of atoms is similar to GLY, but has 1 F atom, 2 Cl atoms, and 1 Br atom too many'):
makeSystem(pdbLines[:9] + [
'ATOM 19 X1 GLY A 2 0 0 0 F',
'ATOM 20 X2 GLY A 2 0 0 0 Cl',
'ATOM 21 X3 GLY A 2 0 0 0 Cl',
'ATOM 22 X4 GLY A 2 0 0 0 Br',
] + pdbLines[9:])
# Add a virtual site to GLY.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The set of heavy atoms matches GLY, but the residue has 1 extra site too many'):
makeSystem(pdbLines[:9] + [
'ATOM 19 X1 GLY A 2 0 0 0 EP',
] + pdbLines[9:])
# Delete HA3 atom from GLY.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The set of heavy atoms matches GLY, but the residue is missing 1 H atom.*You may be able to add it with.*addHydrogens'):
makeSystem(pdbLines[:10] + pdbLines[11:])
# Delete HA2 and HA3 atoms from GLY.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The set of heavy atoms matches GLY, but the residue is missing 2 H atoms.*You may be able to add them with.*addHydrogens'):
makeSystem(pdbLines[:9] + pdbLines[11:])
# Delete HA3 atom from GLY and add a virtual site.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The set of heavy atoms matches GLY, but the residue is missing 1 H atom and has 1 extra site too many'):
makeSystem(pdbLines[:10] + [
'ATOM 19 X1 GLY A 2 0 0 0 EP',
] + pdbLines[11:])
# Rename HA3 atom to remove the CA-HA3 bond.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The set of atoms matches GLY, but the residue is missing 1 H-C bond'):
makeSystem(pdbLines[:10] + [
'ATOM 10 X1 GLY A 2 0 0 0 H',
] + pdbLines[11:])
# Add an extra N-O bond.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The set of atoms matches GLY, but the residue has 1 N-O bond too many'):
makeSystem(pdbLines + [
'CONECT 6 12'
])
# Remove an external bond to NME by renaming its N atom.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The atoms and bonds in the residue match GLY, but the set of externally bonded atoms is missing 1 C atom'):
makeSystem(pdbLines[:13] + [
'ATOM 13 X1 NME A 3 0 0 0 N',
] + pdbLines[14:])
# Add an extra external bond to NME.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The atoms and bonds in the residue match GLY, but the set of externally bonded atoms has 1 O atom too many'):
makeSystem(pdbLines + [
'CONECT 12 15'
])
# Delete ACE so that a capping group is missing from GLY.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The atoms and bonds in the residue match GLY, but the set of externally bonded atoms is missing 1 N atom.*Is the chain missing a terminal capping group?'):
makeSystem(pdbLines[6:])
# Keep the atom/bond element fingerprint the same but change the connectivity.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*GLY.*The atoms and bonds in the residue match GLY, but the connectivity is different'):
# Rename O to break the C=O bond, but then reattach the O to the CA.
makeSystem(pdbLines[:12] + [
'ATOM 12 X1 GLY A 2 0 0 0 O',
] + pdbLines[13:] + [
'CONECT 8 12'
])
# Make water with incorrect atom names so bonds will be missing.
pdbLines = [
'ATOM 0 X1 HOH A 1 0 0 0 O',
'ATOM 1 X2 HOH A 1 0 0 0 H',
'ATOM 2 X3 HOH A 1 0 0 0 H',
]
# Check for a special message when all bonds are missing.
forcefield = ForceField('opc3.xml')
with self.assertRaisesRegex(ValueError, 'No template found for residue.*HOH.*The set of atoms matches HOH, but the residue has no bonds between its atoms'):
makeSystem(pdbLines)
# Add a site to a residue with a force field that doesn't support sites.
with self.assertRaisesRegex(ValueError, 'No template found for residue.*HOH.*The residue contains extra sites, which are not supported by any template in the force field'):
makeSystem(pdbLines + [
'ATOM 3 X4 HOH A 1 0 0 0 EP',
])
# Load a force field so that 1 site will be missing.
forcefield = ForceField('opc.xml')
with self.assertRaisesRegex(ValueError, 'No template found for residue.*HOH.*The set of heavy atoms matches HOH, but the residue is missing 1 extra site.*You may be able to add it with.*addExtraParticles'):
makeSystem(pdbLines)
# Load a force field so that 2 sites will be missing.
forcefield = ForceField('tip5p.xml')
with self.assertRaisesRegex(ValueError, 'No template found for residue.*HOH.*The set of heavy atoms matches HOH, but the residue is missing 2 extra sites.*You may be able to add them with.*addExtraParticles'):
makeSystem(pdbLines)
# Use an empty force field so that there are no templates.
forcefield = ForceField()
with self.assertRaisesRegex(ValueError, 'No template found for residue.*HOH.*The force field contains no residue templates'):
makeSystem(pdbLines)
def test_Wildcard(self):
"""Test that PeriodicTorsionForces using wildcard ('') for atom types / classes in the ffxml are correctly registered"""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment