Commit db72d0cb authored by peastman's avatar peastman
Browse files

Merge pull request #702 from peastman/pdbx

Created reader for PDBx/mmCIF files
parents 4ab3b428 49722158
......@@ -2,7 +2,7 @@ OpenMM was developed by Simbios, the NIH National Center for Physics-Based
Simulation of Biological Structures at Stanford, funded under the NIH Roadmap
for Medical Research, grant U54 GM072970. See https://simtk.org.
Portions copyright © 2008-2014 Stanford University and the Authors.
Portions copyright © 2008-2014 Stanford University and the Authors.
There are several licenses which cover different parts of OpenMM as described
below.
......@@ -119,3 +119,11 @@ freely, subject to the following restrictions:
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
6. PdbxReader
OpenMM uses the PDBx/mmCIF parser written by John Westbrook. It is distributed
under the Creative Commons Attribution 3.0 Unported license. For details, see
https://creativecommons.org/licenses/by/3.0. This library was modified to move
it inside the simtk.openmm.app.internal module.
\ No newline at end of file
......@@ -145,7 +145,10 @@ def buildKeywordDictionary(major_version_num=MAJOR_VERSION_NUM,
"simtk.openmm",
"simtk.openmm.app",
"simtk.openmm.app.internal",
"simtk.openmm.app.internal.charmm"]
"simtk.openmm.app.internal.charmm",
"simtk.openmm.app.internal.pdbx",
"simtk.openmm.app.internal.pdbx.reader",
"simtk.openmm.app.internal.pdbx.writer"]
setupKeywords["data_files"] = []
setupKeywords["package_data"] = {"simtk" : [],
"simtk.unit" : [],
......
......@@ -12,6 +12,7 @@ __email__ = "peastman@stanford.edu"
from topology import Topology, Chain, Residue, Atom
from pdbfile import PDBFile
from pdbxfile import PDBxFile
from forcefield import ForceField
from simulation import Simulation
from pdbreporter import PDBReporter
......
##
#
# File: PdbxContainers.py
# Original: 02-Feb-2009 jdw
#
# Update:
# 23-Mar-2011 jdw Added method to rename attributes in category containers.
# 05-Apr-2011 jdw Change cif writer to select double quoting as preferred
# quoting style where possible.
# 16-Jan-2012 jdw Create base class for DataCategory class
# 22-Mar-2012 jdw when append attributes to existing categories update
# existing rows with placeholder null values.
# 2-Sep-2012 jdw add option to avoid embedded quoting that might
# confuse simple parsers.
# 28-Jun-2013 jdw export remove method
# 29-Jun-2013 jdw export remove row method
##
"""
A collection of container classes supporting the PDBx/mmCIF storage model.
A base container class is defined which supports common features of
data and definition containers. PDBx data files are organized in
sections called data blocks which are mapped to data containers.
PDBx dictionaries contain definition sections and data sections
which are mapped to definition and data containers respectively.
Data in both PDBx data files and dictionaries are organized in
data categories. In the PDBx syntax individual items of data are
identified by labels of the form '_categoryName.attributeName'.
The terms category and attribute in PDBx jargon are analogous to
table and column in a relational data model, or class and attribute
in an object oriented data model.
The DataCategory class provides base storage container for instance
data and definition meta data.
"""
__docformat__ = "restructuredtext en"
__author__ = "John Westbrook"
__email__ = "jwest@rcsb.rutgers.edu"
__license__ = "Creative Commons Attribution 3.0 Unported"
__version__ = "V0.01"
import re,sys,traceback
class CifName(object):
    """Utilities for splitting CIF-style data names of the form
    '_categoryName.attributeName'.
    """

    def __init__(self):
        pass

    @staticmethod
    def categoryPart(name):
        """Return the category portion of a data name.

        A single leading underscore is dropped, then everything before the
        first '.' is returned.  When there is no '.', the whole
        (underscore-stripped) name is returned.
        """
        stripped = name[1:] if name.startswith("_") else name
        dot = stripped.find(".")
        return stripped if dot == -1 else stripped[:dot]

    @staticmethod
    def attributePart(name):
        """Return the text following the first '.' in name, or None when
        name contains no '.'.
        """
        dot = name.find(".")
        if dot == -1:
            return None
        return name[dot + 1:]
class ContainerBase(object):
    """Container base class for data and definition objects.

    Stores objects keyed by their name while remembering the order in
    which names were first added.  Stored objects must provide getName();
    rename() also calls their setName(), and printIt() calls their
    printIt() or dumpIt().
    """

    def __init__(self, name):
        # The enclosing scope of the data container (e.g. data_/save_)
        self.__name = name
        # List of category names within this container -
        self.__objNameList = []
        # dictionary of DataCategory objects keyed by category name.
        self.__objCatalog = {}
        self.__type = None

    def getType(self):
        return self.__type

    def setType(self, type):
        self.__type = type

    def getName(self):
        return self.__name

    def setName(self, name):
        self.__name = name

    def exists(self, name):
        """Return True if an object called name is stored here."""
        return name in self.__objCatalog

    def getObj(self, name):
        """Return the stored object called name, or None if absent."""
        return self.__objCatalog.get(name)

    def getObjNameList(self):
        """Return the (live) list of object names in insertion order."""
        return self.__objNameList

    def append(self, obj):
        """ Add the input object to the current object catalog. An existing object
            of the same name will be overwritten.

            Objects whose getName() is None are ignored.
        """
        objName = obj.getName()
        if objName is None:
            return
        if objName not in self.__objCatalog:
            # self.__objNameList is keeping track of object order here --
            self.__objNameList.append(objName)
        self.__objCatalog[objName] = obj

    def replace(self, obj):
        """ Replace an existing object with the input object.

            Nothing happens unless an object of the same name is already
            stored.
        """
        objName = obj.getName()
        if objName is not None and objName in self.__objCatalog:
            self.__objCatalog[objName] = obj

    def printIt(self, fh=sys.stdout, type="brief"):
        """Write a summary of the container to fh.

        With type == 'brief' each stored object's printIt(fh) is used,
        otherwise its dumpIt(fh).
        """
        fh.write("+ %s container: %30s contains %4d categories\n" %
                 (self.getType(), self.getName(), len(self.__objNameList)))
        for nm in self.__objNameList:
            fh.write("--------------------------------------------\n")
            fh.write("Data category: %s\n" % nm)
            if type == 'brief':
                self.__objCatalog[nm].printIt(fh)
            else:
                self.__objCatalog[nm].dumpIt(fh)

    def rename(self, curName, newName):
        """ Change the name of an object in place -

            Returns True on success, False if curName is unknown or the
            object's setName() fails.  The object keeps its position in the
            name list.
        """
        if curName not in self.__objCatalog or curName not in self.__objNameList:
            return False
        obj = self.__objCatalog[curName]
        try:
            # Rename the object first so a failure leaves the container untouched.
            obj.setName(newName)
        except Exception:
            return False
        self.__objNameList[self.__objNameList.index(curName)] = newName
        # Drop the old key so exists(curName)/getObj(curName) no longer
        # resolve to the renamed object.
        del self.__objCatalog[curName]
        self.__objCatalog[newName] = obj
        return True

    def remove(self, curName):
        """ Remove object by name.  Return True on success or False otherwise.
        """
        if curName not in self.__objCatalog:
            return False
        del self.__objCatalog[curName]
        try:
            self.__objNameList.remove(curName)
        except ValueError:
            # Catalog and name list were out of step; report failure.
            return False
        return True
class DefinitionContainer(ContainerBase):
    """Container whose type is 'definition'.

    The kind of definition is inferred from which categories it holds:
    a 'category' object marks a category definition, an 'item' object an
    attribute (item) definition.
    """

    def __init__(self, name):
        super(DefinitionContainer, self).__init__(name)
        self.setType('definition')

    def isCategory(self):
        """Return True if this container holds a 'category' object."""
        return self.exists('category')

    def isAttribute(self):
        """Return True if this container holds an 'item' object."""
        return self.exists('item')

    def printIt(self, fh=sys.stdout, type="brief"):
        """Write a summary of the definition and its categories to fh.

        With type == 'brief' each category's printIt(fh) is used,
        otherwise its dumpIt(fh).
        """
        fh.write("Definition container: %30s contains %4d categories\n" %
                 (self.getName(), len(self.getObjNameList())))
        if self.isCategory():
            fh.write("Definition type: category\n")
        elif self.isAttribute():
            fh.write("Definition type: item\n")
        else:
            fh.write("Definition type: undefined\n")

        for nm in self.getObjNameList():
            fh.write("--------------------------------------------\n")
            fh.write("Definition category: %s\n" % nm)
            if type == 'brief':
                self.getObj(nm).printIt(fh)
            else:
                # Category objects expose dumpIt(); there is no dumpId().
                self.getObj(nm).dumpIt(fh)
class DataContainer(ContainerBase):
    ''' Container class for DataCategory objects.

        The container type is 'data'; a flag records whether the block
        was marked as global via setGlobal().
    '''

    def __init__(self, name):
        super(DataContainer, self).__init__(name)
        self.setType('data')
        self.__globalFlag = False

    def invokeDataBlockMethod(self, type, method, db):
        """Execute the code text returned by method.getInline().

        The code runs in this method's scope, so it can read self, type,
        method and db.
        """
        self.__currentRow = 1
        # NOTE(review): this executes arbitrary code text; only call it with
        # method definitions that come from a trusted source.
        # The call form of exec is accepted by both Python 2 and Python 3.
        exec(method.getInline())

    def setGlobal(self):
        self.__globalFlag = True

    def getGlobal(self):
        return self.__globalFlag
class DataCategoryBase(object):
    """ Base object definition for a data category -

        Holds the category name, the ordered attribute name list, the row
        list, and a catalog mapping lower-cased attribute names to their
        spelling in the attribute list.
    """

    def __init__(self, name, attributeNameList=None, rowList=None):
        self._name = name
        # Caller-supplied lists are stored as given (not copied).
        self._rowList = rowList if rowList is not None else []
        self._attributeNameList = (attributeNameList
                                   if attributeNameList is not None else [])
        #
        # Derived class data -
        #
        self._catalog = {}
        self._numAttributes = 0
        self.__setup()

    def __setup(self):
        """Recompute the attribute count and the lower-case name catalog."""
        names = self._attributeNameList
        self._numAttributes = len(names)
        self._catalog = dict((nm.lower(), nm) for nm in names)

    def setRowList(self, rowList):
        self._rowList = rowList

    def setAttributeNameList(self, attributeNameList):
        """Replace the attribute names and rebuild the derived data."""
        self._attributeNameList = attributeNameList
        self.__setup()

    def setName(self, name):
        self._name = name

    def get(self):
        """Return the tuple (name, attribute name list, row list)."""
        return (self._name, self._attributeNameList, self._rowList)
class DataCategory(DataCategoryBase):
    """ Methods for creating, accessing, and formatting PDBx cif data categories.

        Rows are lists of values aligned by position with the attribute
        name list kept by DataCategoryBase.  A "current" row index and
        attribute name serve as defaults for getValue(), setValue() and
        getValueFormatted().
    """

    def __init__(self, name, attributeNameList=None, rowList=None):
        super(DataCategory, self).__init__(name, attributeNameList, rowList)
        # Destination for diagnostic messages.
        self.__lfh = sys.stdout
        self.__currentRowIndex = 0
        self.__currentAttribute = None
        # When True, a quoting style is rejected if the value contains the
        # other quote character next to white space.
        self.__avoidEmbeddedQuoting = False
        #
        # --------------------------------------------------------------------
        # any whitespace
        self.__wsRe = re.compile(r"\s")
        self.__wsAndQuotesRe = re.compile(r"[\s'\"]")
        # any newline or carriage control
        self.__nlRe = re.compile(r"[\n\r]")
        # single quote
        self.__sqRe = re.compile(r"[']")
        self.__sqWsRe = re.compile(r"('\s)|(\s')")
        # double quote
        self.__dqRe = re.compile(r'["]')
        self.__dqWsRe = re.compile(r'("\s)|(\s")')
        #
        self.__intRe = re.compile(r'^[0-9]+$')
        self.__floatRe = re.compile(r'^-?(([0-9]+)[.]?|([0-9]*[.][0-9]+))([(][0-9]+[)])?([eE][+-]?[0-9]+)?$')
        # The two lists are parallel: position i of __formatTypeList is the
        # format type used for data type i.  The order of __dataTypeList is
        # also used as a ranking when combining the types seen in a column.
        self.__dataTypeList = ['DT_NULL_VALUE', 'DT_INTEGER', 'DT_FLOAT', 'DT_UNQUOTED_STRING', 'DT_ITEM_NAME',
                               'DT_DOUBLE_QUOTED_STRING', 'DT_SINGLE_QUOTED_STRING', 'DT_MULTI_LINE_STRING']
        self.__formatTypeList = ['FT_NULL_VALUE', 'FT_NUMBER', 'FT_NUMBER', 'FT_UNQUOTED_STRING',
                                 'FT_QUOTED_STRING', 'FT_QUOTED_STRING', 'FT_QUOTED_STRING', 'FT_MULTI_LINE_STRING']

    def __getitem__(self, x):
        """ Implements list-type functionality -
             Implements op[x] for some special cases -
                x=integer - returns the row in category (normal list behavior)
                x=string  - returns the value of attribute 'x' in first row.

            Raises KeyError for an unknown attribute or a missing first row,
            and TypeError for any other kind of index.
        """
        if isinstance(x, int):
            return self._rowList[x]
        if isinstance(x, str):
            ii = self.getAttributeIndex(x)
            # getAttributeIndex() signals "not found" with -1; using that as
            # an index would silently return the last column.
            if ii < 0:
                raise KeyError(x)
            try:
                return self._rowList[0][ii]
            except IndexError:
                raise KeyError(x)
        raise TypeError(x)

    def getCurrentAttribute(self):
        return self.__currentAttribute

    def getRowIndex(self):
        return self.__currentRowIndex

    def getRowList(self):
        return self._rowList

    def getRowCount(self):
        return len(self._rowList)

    def getRow(self, index):
        """Return the row at index, or an empty list if it is unavailable."""
        try:
            return self._rowList[index]
        except (IndexError, TypeError):
            return []

    def removeRow(self, index):
        """Delete the row at a non-negative index.

        The current row index is pulled back if it would point past the
        end.  Returns True if a row was deleted, False otherwise.
        """
        try:
            if 0 <= index < len(self._rowList):
                del self._rowList[index]
                if self.__currentRowIndex >= len(self._rowList):
                    self.__currentRowIndex = len(self._rowList) - 1
                return True
        except TypeError:
            pass
        return False

    def getFullRow(self, index):
        """ Return a full row based on the length of the attribute list.

            A short row is padded in place with '?'.  If the row cannot be
            accessed, a fresh list of '?' placeholders is returned.
        """
        try:
            row = self._rowList[index]
            missing = self._numAttributes - len(row)
            if missing > 0:
                row.extend(['?'] * missing)
            return row
        except (IndexError, TypeError):
            return ['?' for ii in range(self._numAttributes)]

    def getName(self):
        return self._name

    def getAttributeList(self):
        return self._attributeNameList

    def getAttributeCount(self):
        return len(self._attributeNameList)

    def getAttributeListWithOrder(self):
        """Return a list of (attribute name, position) tuples."""
        return [(att, ii) for ii, att in enumerate(self._attributeNameList)]

    def getAttributeIndex(self, attributeName):
        """Return the position of attributeName, or -1 if it is unknown."""
        try:
            return self._attributeNameList.index(attributeName)
        except ValueError:
            return -1

    def hasAttribute(self, attributeName):
        return attributeName in self._attributeNameList

    def getIndex(self, attributeName):
        """Same as getAttributeIndex()."""
        return self.getAttributeIndex(attributeName)

    def getItemNameList(self):
        """Return the item names '_<category>.<attribute>' in order."""
        return ["_" + self._name + "." + att for att in self._attributeNameList]

    def append(self, row):
        self._rowList.append(row)

    def appendAttribute(self, attributeName):
        """Add an attribute name, matching existing names without regard
        to case.  A case-insensitive match is replaced by the new spelling
        instead of adding a second column.
        """
        attributeNameLC = attributeName.lower()
        if attributeNameLC in self._catalog:
            i = self._attributeNameList.index(self._catalog[attributeNameLC])
            self._attributeNameList[i] = attributeName
        else:
            self._attributeNameList.append(attributeName)
        self._catalog[attributeNameLC] = attributeName
        self._numAttributes = len(self._attributeNameList)

    def appendAttributeExtendRows(self, attributeName):
        """Like appendAttribute(), but a genuinely new attribute also
        appends a '?' placeholder to every existing row.  Re-adding an
        existing attribute writes a message to the log handle.
        """
        attributeNameLC = attributeName.lower()
        if attributeNameLC in self._catalog:
            i = self._attributeNameList.index(self._catalog[attributeNameLC])
            self._attributeNameList[i] = attributeName
            self._catalog[attributeNameLC] = attributeName
            self.__lfh.write("Appending existing attribute %s\n" % attributeName)
        else:
            self._attributeNameList.append(attributeName)
            self._catalog[attributeNameLC] = attributeName
            # add a placeholder to any existing rows for the new attribute.
            for row in self._rowList:
                row.append("?")
        self._numAttributes = len(self._attributeNameList)

    def getValue(self, attributeName=None, rowIndex=None):
        """Return the value of an attribute in a row.

        attributeName and rowIndex default to the current attribute and
        current row.  Raises IndexError when the row or column is out of
        range or when the attribute/row are not a str/int pair; an unknown
        attribute name raises ValueError.
        """
        attribute = self.__currentAttribute if attributeName is None else attributeName
        rowI = self.__currentRowIndex if rowIndex is None else rowIndex
        if isinstance(attribute, str) and isinstance(rowI, int):
            return self._rowList[rowI][self._attributeNameList.index(attribute)]
        raise IndexError(attribute)

    def setValue(self, value, attributeName=None, rowIndex=None):
        """Store value for an attribute in a row, creating rows and padding
        the row with None as needed.

        attributeName and rowIndex default to the current attribute and
        current row.  Index and unknown-attribute errors are written to the
        log handle rather than raised.
        """
        attribute = self.__currentAttribute if attributeName is None else attributeName
        rowI = self.__currentRowIndex if rowIndex is None else rowIndex
        if isinstance(attribute, str) and isinstance(rowI, int):
            try:
                # if row index is out of range - add the rows -
                for ii in range(rowI + 1 - len(self._rowList)):
                    self._rowList.append(self.__emptyRow())
                ind = self._attributeNameList.index(attribute)
                # extend the list if needed -
                self.__padRow(self._rowList[rowI], ind)
                self._rowList[rowI][ind] = value
            except IndexError:
                self.__lfh.write("DataCategory(setvalue) index error category %s attribute %s index %d value %r\n" %
                                 (self._name, attribute, rowI, value))
                traceback.print_exc(file=self.__lfh)
            except ValueError:
                self.__lfh.write("DataCategory(setvalue) value error category %s attribute %s index %d value %r\n" %
                                 (self._name, attribute, rowI, value))
                traceback.print_exc(file=self.__lfh)

    def __padRow(self, row, ind):
        """Extend row with None so that position ind exists.

        Keeps the generous 2*ind growth of the original code but always
        adds at least enough slots to reach ind (2*ind adds nothing when
        ind is 0 and the row is empty).
        """
        if ind >= len(row):
            row.extend([None] * (max(2 * ind, ind + 1) - len(row)))

    def __emptyRow(self):
        return [None for ii in range(len(self._attributeNameList))]

    def replaceValue(self, oldValue, newValue, attributeName):
        """Replace values equal to oldValue in one column.

        Returns the number of replacements (0 for an unknown attribute).
        """
        numReplace = 0
        if attributeName not in self._attributeNameList:
            return numReplace
        ind = self._attributeNameList.index(attributeName)
        for row in self._rowList:
            if row[ind] == oldValue:
                row[ind] = newValue
                numReplace += 1
        return numReplace

    def replaceSubstring(self, oldValue, newValue, attributeName):
        """Apply str.replace(oldValue, newValue) to every value in one
        column.  Returns True if any value changed.
        """
        ok = False
        if attributeName not in self._attributeNameList:
            return ok
        ind = self._attributeNameList.index(attributeName)
        for row in self._rowList:
            val = row[ind]
            row[ind] = val.replace(oldValue, newValue)
            if val != row[ind]:
                ok = True
        return ok

    def invokeAttributeMethod(self, attributeName, type, method, db):
        """Run the code text from method.getInline() once per row for the
        given attribute, adding the attribute (and a first row) if missing.

        The code runs in this method's scope; local names are kept as in
        the original so inline code that reads them keeps working.
        """
        self.__currentRowIndex = 0
        self.__currentAttribute = attributeName
        self.appendAttribute(attributeName)
        currentRowIndex = self.__currentRowIndex
        #
        ind = self._attributeNameList.index(attributeName)
        if len(self._rowList) == 0:
            row = [None for ii in range(len(self._attributeNameList) * 2)]
            row[ind] = None
            self._rowList.append(row)

        for row in self._rowList:
            # Newly added slots are None, i.e. the attribute starts unset.
            self.__padRow(row, ind)
            # NOTE(review): executes arbitrary code text; only use with
            # trusted method definitions.
            exec(method.getInline())
            self.__currentRowIndex += 1
            currentRowIndex = self.__currentRowIndex

    def invokeCategoryMethod(self, type, method, db):
        """Run the code text from method.getInline() once for the category."""
        self.__currentRowIndex = 0
        # NOTE(review): executes arbitrary code text; only use with trusted
        # method definitions.
        exec(method.getInline())

    def getAttributeLengthMaximumList(self):
        """Return, per attribute, the longest str() length found in any row.

        Values beyond the attribute list in over-long rows are ignored.
        """
        mList = [0] * len(self._attributeNameList)
        for row in self._rowList:
            for indx, val in enumerate(row[:len(mList)]):
                mList[indx] = max(mList[indx], len(str(val)))
        return mList

    def renameAttribute(self, curAttributeName, newAttributeName):
        """ Change the name of an attribute in place -

            Returns True on success or False if curAttributeName is unknown.
        """
        try:
            i = self._attributeNameList.index(curAttributeName)
        except ValueError:
            return False
        self._attributeNameList[i] = newAttributeName
        self._catalog.pop(curAttributeName.lower(), None)
        self._catalog[newAttributeName.lower()] = newAttributeName
        return True

    def printIt(self, fh=sys.stdout):
        """Write the attribute names and the first two rows (values cut to
        30 characters) to fh.
        """
        fh.write("--------------------------------------------\n")
        fh.write("  Category: %s attribute list length: %d\n" %
                 (self._name, len(self._attributeNameList)))
        for at in self._attributeNameList:
            fh.write("  Category: %s attribute: %s\n" % (self._name, at))

        fh.write("  Row value list length: %d\n" % len(self._rowList))
        #
        for row in self._rowList[:2]:
            if len(row) == len(self._attributeNameList):
                for ii, v in enumerate(row):
                    fh.write("        %30s: %s ...\n" % (self._attributeNameList[ii], str(v)[:30]))
            else:
                fh.write("+WARNING - %s data length %d attribute name length %s mismatched\n" %
                         (self._name, len(row), len(self._attributeNameList)))

    def dumpIt(self, fh=sys.stdout):
        """Write the attribute names and every value of every row to fh."""
        fh.write("--------------------------------------------\n")
        fh.write("  Category: %s attribute list length: %d\n" %
                 (self._name, len(self._attributeNameList)))
        for at in self._attributeNameList:
            fh.write("  Category: %s attribute: %s\n" % (self._name, at))

        fh.write("  Value list length: %d\n" % len(self._rowList))
        for row in self._rowList:
            for ii, v in enumerate(row):
                fh.write("        %30s: %s\n" % (self._attributeNameList[ii], v))

    def __quotedStringType(self, inp):
        """Pick the data type for a single-line string that contains white
        space or quote characters.  Double quoting is preferred.

        Shared by __formatPdbx() and __dataTypePdbx() so that the reported
        type always matches the quoting that is actually written.
        """
        doubleOk = not self.__dqRe.search(inp)
        singleOk = not self.__sqRe.search(inp)
        if self.__avoidEmbeddedQuoting:
            doubleOk = doubleOk and not self.__sqWsRe.search(inp)
            singleOk = singleOk and not self.__dqWsRe.search(inp)
        if doubleOk:
            return 'DT_DOUBLE_QUOTED_STRING'
        if singleOk:
            return 'DT_SINGLE_QUOTED_STRING'
        return 'DT_MULTI_LINE_STRING'

    def __formatPdbx(self, inp):
        """ Format input data following PDBx quoting rules -

            Returns (list of string fragments, data type name).  Any
            failure is written to the log handle and then re-raised.
        """
        try:
            if inp is None:
                return (["?"], 'DT_NULL_VALUE')

            # pure numerical values are returned as unquoted strings
            if isinstance(inp, int) or self.__intRe.search(str(inp)):
                return ([str(inp)], 'DT_INTEGER')

            if isinstance(inp, float) or self.__floatRe.search(str(inp)):
                return ([str(inp)], 'DT_FLOAT')

            # null value handling -
            if inp == "." or inp == "?":
                return ([inp], 'DT_NULL_VALUE')

            if inp == "":
                return (["."], 'DT_NULL_VALUE')

            # Contains white space or quotes ?
            if not self.__wsAndQuotesRe.search(inp):
                if inp.startswith("_"):
                    return (self.__doubleQuotedList(inp), 'DT_ITEM_NAME')
                return ([str(inp)], 'DT_UNQUOTED_STRING')

            if self.__nlRe.search(inp):
                return (self.__semiColonQuotedList(inp), 'DT_MULTI_LINE_STRING')

            dType = self.__quotedStringType(inp)
            if dType == 'DT_DOUBLE_QUOTED_STRING':
                return (self.__doubleQuotedList(inp), dType)
            if dType == 'DT_SINGLE_QUOTED_STRING':
                return (self.__singleQuotedList(inp), dType)
            return (self.__semiColonQuotedList(inp), dType)
        except Exception:
            # Returning None here would only surface later as an unrelated
            # unpacking error in the caller, so log and propagate.
            traceback.print_exc(file=self.__lfh)
            raise

    def __dataTypePdbx(self, inp):
        """ Detect the PDBx data type -

            Follows the same decisions as __formatPdbx() and returns only
            the data type name.
        """
        if inp is None:
            return 'DT_NULL_VALUE'

        # pure numerical values are returned as unquoted strings
        if isinstance(inp, int) or self.__intRe.search(str(inp)):
            return 'DT_INTEGER'

        if isinstance(inp, float) or self.__floatRe.search(str(inp)):
            return 'DT_FLOAT'

        # null value handling -
        if inp == "." or inp == "?" or inp == "":
            return 'DT_NULL_VALUE'

        # Contains white space or quotes ?
        if not self.__wsAndQuotesRe.search(inp):
            if inp.startswith("_"):
                return 'DT_ITEM_NAME'
            return 'DT_UNQUOTED_STRING'

        if self.__nlRe.search(inp):
            return 'DT_MULTI_LINE_STRING'
        return self.__quotedStringType(inp)

    def __singleQuotedList(self, inp):
        return ["'", inp, "'"]

    def __doubleQuotedList(self, inp):
        return ['"', inp, '"']

    def __semiColonQuotedList(self, inp):
        """Return the fragments of a semi-colon delimited multi-line
        string; a newline is added before the closing ';' unless inp
        already ends with one.
        """
        if inp[-1] == '\n':
            return ["\n", ";", inp, ";", "\n"]
        return ["\n", ";", inp, "\n", ";", "\n"]

    def getValueFormatted(self, attributeName=None, rowIndex=None):
        """Return a value formatted (quoted) following PDBx rules.

        attributeName and rowIndex default to the current attribute and
        current row.  Raises IndexError (after logging) when the row or
        column is out of range, and TypeError when the attribute/row are
        not a str/int pair.
        """
        attribute = self.__currentAttribute if attributeName is None else attributeName
        rowI = self.__currentRowIndex if rowIndex is None else rowIndex
        if isinstance(attribute, str) and isinstance(rowI, int):
            try:
                parts, dType = self.__formatPdbx(self._rowList[rowI][self._attributeNameList.index(attribute)])
                return "".join(parts)
            except IndexError:
                # getRow() is used so that logging cannot itself fail when
                # the row index is the thing that is out of range.
                self.__lfh.write("attributeName %s rowI %r rowdata %r\n" % (attributeName, rowI, self.getRow(rowI)))
                raise
        raise TypeError(attribute)

    def getValueFormattedByIndex(self, attributeIndex, rowIndex):
        """Return the PDBx-formatted value at the given column and row
        positions.  Raises IndexError if either is out of range.
        """
        parts, dType = self.__formatPdbx(self._rowList[rowIndex][attributeIndex])
        return "".join(parts)

    def getAttributeValueMaxLengthList(self, steps=1):
        """Return, per attribute, the longest str() length among every
        steps-th row.
        """
        mList = [0 for i in range(len(self._attributeNameList))]
        for row in self._rowList[::steps]:
            for indx in range(len(self._attributeNameList)):
                mList[indx] = max(mList[indx], len(str(row[indx])))
        return mList

    def __mergeDataType(self, curDataTypeList, indx, val):
        """Raise the recorded type of column indx to the type of val if
        that type ranks later in __dataTypeList.
        """
        dIndx = self.__dataTypeList.index(self.__dataTypePdbx(val))
        cIndx = self.__dataTypeList.index(curDataTypeList[indx])
        curDataTypeList[indx] = self.__dataTypeList[max(cIndx, dIndx)]

    def __formatTypesFor(self, dataTypeList):
        """Map the format types to the data types."""
        return [self.__formatTypeList[self.__dataTypeList.index(dt)] for dt in dataTypeList]

    def getFormatTypeList(self, steps=1):
        """Return (format type list, data type list) for the attributes,
        examining every steps-th row.

        A failure while scanning is written to the log handle and the
        types gathered up to that point are returned.
        """
        curDataTypeList = ['DT_NULL_VALUE' for i in range(len(self._attributeNameList))]
        # Pre-set so the log message below can always be formatted.
        indx = -1
        row = None
        try:
            for row in self._rowList[::steps]:
                for indx in range(len(self._attributeNameList)):
                    self.__mergeDataType(curDataTypeList, indx, row[indx])
        except (IndexError, TypeError, ValueError):
            self.__lfh.write("PdbxDataCategory(getFormatTypeList) ++Index error at index %d in row %r\n" % (indx, row))

        return self.__formatTypesFor(curDataTypeList), curDataTypeList

    def getFormatTypeListX(self):
        """Return (format type list, data type list) for the attributes,
        examining every row.  Errors are not caught.
        """
        curDataTypeList = ['DT_NULL_VALUE' for i in range(len(self._attributeNameList))]
        for row in self._rowList:
            for indx in range(len(self._attributeNameList)):
                self.__mergeDataType(curDataTypeList, indx, row[indx])

        return self.__formatTypesFor(curDataTypeList), curDataTypeList
##
# File: PdbxParser.py
# Date: 2009-10-25 Jdw Original from py-pdbx-parser-v2
#
# Update:
#
# 2009-11-05 - (jdw) Change table storage architecture for list of
# dictionaries to list of lists.
# 2012-01-09 - (jdw) This module is now obsoleted by PdbxReader/PdbxWriter
# modules. APIs are preserved.
#
# 2012-09-01 - (jdw) Revise tokenizer to better handle embedded quoting.
#
# NOTE - - Now obsolete - Use pdbx.reader.PdbxReader & pdbx.writer.PdbxWriter
#
##
"""
PDBx/mmCIF dictionary and data file parser.
Acknowledgements:
The tokenizer used in this module is modeled after the clever parser design
used in the PyMMLIB package.
PyMMLib Development Group
Authors: Ethan Merritt: merritt@u.washington.ed & Jay Painter: jay.painter@gmail.com
See: http://pymmlib.sourceforge.net/
"""
__docformat__ = "restructuredtext en"
__author__ = "John Westbrook"
__email__ = "jwest@rcsb.rutgers.edu"
__license__ = "Creative Commons Attribution 3.0 Unported"
__version__ = "V0.01"
import re,sys
from simtk.openmm.app.internal.pdbx.reader.PdbxContainers import *
class PdbxError(Exception):
    """ Exception type for general PDBx errors.
    """
class SyntaxError(Exception):
    """ Syntax error carrying the input line number and a message.

        Note that within this module the name hides the builtin
        SyntaxError.
    """

    def __init__(self, lineNumber, text):
        super(SyntaxError, self).__init__()
        self.text = text
        self.lineNumber = lineNumber

    def __str__(self):
        details = (self.lineNumber, self.text)
        return "%%ERROR - [at line: %d] %s" % details
class PdbxReader(object):
""" PDBx reader for data files and dictionaries.
"""
def __init__(self, ifh):
    """ ifh - input file handle returned by open()
    """
    self.__ifh = ifh
    # Number of input lines consumed so far; reported in syntax errors.
    self.__curLineNumber = 0
    # Reserved word prefix (text before the first '_') -> parser state.
    self.__stateDict = dict([
        ("data", "ST_DATA_CONTAINER"),
        ("loop", "ST_TABLE"),
        ("global", "ST_GLOBAL_CONTAINER"),
        ("save", "ST_DEFINITION"),
        ("stop", "ST_STOP"),
    ])
def read(self, containerList):
    """
    Appends to the input list of definition and data containers.

    The parser normally finishes when the tokenizer runs out of input,
    which reaches this method as StopIteration.  If the parser returns
    without that happening, PdbxError is raised.
    """
    self.__curLineNumber = 0
    tokens = self.__tokenizer(self.__ifh)
    try:
        self.__parser(tokens, containerList)
    except StopIteration:
        return
    raise PdbxError()
def __syntaxError(self, errText):
    """ Raise SyntaxError for errText, tagged with the current line number.
    """
    lineNo = self.__curLineNumber
    raise SyntaxError(lineNo, errText)
def __getContainerName(self, inWord):
    """ Returns the name of the data_ or save_ container

        The name is whatever follows the first five characters of the
        token, with surrounding white space removed.
    """
    suffix = inWord[5:]
    return str(suffix).strip()
def __getState(self, inWord):
    """Identifies reserved syntax elements and assigns an associated state.

       Returns: (reserved word, state)
       where -
          reserved word -  is one of CIF syntax elements:
                           data_, loop_, global_, save_, stop_
          state - the parser state required to process this next section.

       Anything else yields (None, "ST_UNKNOWN").  The reserved word is
       matched without regard to case and returned in lower case, without
       the underscore.
    """
    unknown = (None, "ST_UNKNOWN")
    cut = inWord.find("_")
    if cut == -1:
        return unknown
    word = inWord[:cut].lower()
    if word not in self.__stateDict:
        return unknown
    return word, self.__stateDict[word]
def __parser(self, tokenizer, containerList):
    """ Parser for PDBx data files and dictionaries.

        Input - tokenizer() reentrant method recognizing data item names (_category.attribute)
                quoted strings (single, double and multi-line semi-colon delimited), and unquoted
                strings.

                containerList - list-type container for data and definition objects parsed from
                                the input file.

        Return:
                containerList - is appended with data and definition objects -

        Parsing normally ends when the tokenizer is exhausted, which
        leaves this method as StopIteration raised by next(tokenizer).
        Syntax problems are reported through self.__syntaxError().
    """
    # Working container - data or definition
    curContainer = None
    #
    # Working category container
    categoryIndex = {}
    curCategory = None
    #
    curRow = None
    state = None

    # Find the first reserved word and begin capturing data.
    #
    while True:
        curCatName, curAttName, curQuotedString, curWord = next(tokenizer)
        if curWord is None:
            continue
        reservedWord, state = self.__getState(curWord)
        if reservedWord is not None:
            break

    while True:
        #
        #  Set the current state  -
        #
        #  At this point in the processing cycle we are expecting a token containing
        #  either a '_category.attribute'  or a reserved word.
        #
        if curCatName is not None:
            state = "ST_KEY_VALUE_PAIR"
        elif curWord is not None:
            reservedWord, state = self.__getState(curWord)
        else:
            self.__syntaxError("Miscellaneous syntax error")
            return

        #
        # Process  _category.attribute  value assignments
        #
        if state == "ST_KEY_VALUE_PAIR":
            try:
                curCategory = categoryIndex[curCatName]
            except KeyError:
                # A new category is encountered - create a container and add a row
                curCategory = categoryIndex[curCatName] = DataCategory(curCatName)

                try:
                    curContainer.append(curCategory)
                except AttributeError:
                    self.__syntaxError("Category cannot be added to data_ block")
                    return

                curRow = []
                curCategory.append(curRow)
            else:
                # Recover the existing row from the category
                try:
                    curRow = curCategory[0]
                except IndexError:
                    self.__syntaxError("Internal index error accessing category data")
                    return

            # Check for duplicate attributes and add attribute to table.
            if curAttName in curCategory.getAttributeList():
                self.__syntaxError("Duplicate attribute encountered in category")
                return
            else:
                curCategory.appendAttribute(curAttName)

            # Get the data for this attribute from the next token
            tCat, tAtt, curQuotedString, curWord = next(tokenizer)

            if tCat is not None or (curQuotedString is None and curWord is None):
                self.__syntaxError("Missing data for item _%s.%s" % (curCatName, curAttName))

            if curWord is not None:
                #
                # Validation check token for misplaced reserved words  -
                #
                reservedWord, state = self.__getState(curWord)
                if reservedWord is not None:
                    self.__syntaxError("Unexpected reserved word: %s" % (reservedWord))

                curRow.append(curWord)

            elif curQuotedString is not None:
                curRow.append(curQuotedString)

            else:
                self.__syntaxError("Missing value in item-value pair")

            curCatName, curAttName, curQuotedString, curWord = next(tokenizer)
            continue

        #
        # Process a loop_ declaration and associated data -
        #
        elif state == "ST_TABLE":

            # The category name in the next curCatName,curAttName pair
            #    defines the name of the category container.
            curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

            if curCatName is None or curAttName is None:
                self.__syntaxError("Unexpected token in loop_ declaration")
                return

            # Check for a previous category declaration.
            if curCatName in categoryIndex:
                self.__syntaxError("Duplicate category declaration in loop_")
                return

            curCategory = DataCategory(curCatName)

            try:
                curContainer.append(curCategory)
            except AttributeError:
                self.__syntaxError("loop_ declaration outside of data_ block or save_ frame")
                return

            curCategory.appendAttribute(curAttName)

            # Read the rest of the loop_ declaration
            while True:
                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

                if curCatName is None:
                    break

                if curCatName != curCategory.getName():
                    self.__syntaxError("Changed category name in loop_ declaration")
                    return

                curCategory.appendAttribute(curAttName)

            # If the next token is a 'word', check it for any reserved words -
            if curWord is not None:
                reservedWord, state = self.__getState(curWord)
                if reservedWord is not None:
                    if reservedWord == "stop":
                        return
                    else:
                        self.__syntaxError("Unexpected reserved word after loop declaration: %s" % (reservedWord))

            # Read the table of data for this loop_ -
            while True:
                curRow = []
                curCategory.append(curRow)

                for tAtt in curCategory.getAttributeList():
                    if curWord is not None:
                        curRow.append(curWord)
                    elif curQuotedString is not None:
                        curRow.append(curQuotedString)

                    curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

                # loop_ data processing ends if -

                # A new _category.attribute is encountered
                if curCatName is not None:
                    break

                # A reserved word is encountered
                if curWord is not None:
                    reservedWord, state = self.__getState(curWord)
                    if reservedWord is not None:
                        break

            continue

        elif state == "ST_DEFINITION":
            # Ignore trailing unnamed saveframe delimiters e.g. 'save_'
            sName = self.__getContainerName(curWord)
            if len(sName) > 0:
                curContainer = DefinitionContainer(sName)
                containerList.append(curContainer)
                categoryIndex = {}
                curCategory = None

            curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

        elif state == "ST_DATA_CONTAINER":
            #
            dName = self.__getContainerName(curWord)
            if len(dName) == 0:
                dName = "unidentified"
            curContainer = DataContainer(dName)
            containerList.append(curContainer)
            categoryIndex = {}
            curCategory = None
            curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

        elif state == "ST_STOP":
            return

        # The reader's state table maps 'global' to "ST_GLOBAL_CONTAINER";
        # testing only "ST_GLOBAL" left global_ tokens unhandled, and the
        # loop then repeated forever on the same token.
        elif state in ("ST_GLOBAL", "ST_GLOBAL_CONTAINER"):
            curContainer = DataContainer("blank-global")
            curContainer.setGlobal()
            containerList.append(curContainer)
            categoryIndex = {}
            curCategory = None
            curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

        elif state == "ST_UNKNOWN":
            self.__syntaxError("Unrecogized syntax element: " + str(curWord))
            return

        else:
            # No branch consumed a token; stop instead of spinning.
            self.__syntaxError("Unhandled parser state: " + str(state))
            return
def __tokenizer(self, ifh):
    """Tokenize mmCIF text, yielding one 4-tuple per token.

    Each yielded tuple has the form
        (category name, attribute name, quoted string, unquoted word)
    and the slots that do not apply to the token are None.  Single- and
    double-quoted strings are matched by separate alternatives of the
    regular expression (to better handle embedded quotes) and are both
    reported in the quoted-string slot.

    ifh - iterable of text lines (e.g. a file handle returned by open()).

    self.__curLineNumber is incremented for every line consumed.  The
    generator simply finishes when the input is exhausted, including when
    the input ends inside a semi-colon delimited text field (that
    unterminated field is not yielded).
    """
    #
    # Regex definition for mmCIF syntax - semi-colon delimited strings are handled
    # outside of this regex.  Every fragment is a raw string so that \S and \s
    # reach the regex engine untouched.
    mmcifRe = re.compile(
        r"(?:"
        r"(?:_(.+?)[.](\S+))" "|"                # _category.attribute
        r"(?:['](.*?)(?:[']\s|[']$))" "|"         # single quoted strings
        r"(?:[\"](.*?)(?:[\"]\s|[\"]$))" "|"      # double quoted strings
        r"(?:\s*#.*$)" "|"                         # comments (dumped)
        r"(\S+)"                                   # unquoted words
        r")")
    fileIter = iter(ifh)
    # Iterating with 'for' (rather than calling .next()) works on Python 2 and 3
    # and ends the generator cleanly instead of leaking StopIteration.
    for line in fileIter:
        self.__curLineNumber += 1
        # Dump comments
        if line.startswith("#"):
            continue
        # Gobble up the entire semi-colon/multi-line delimited string and
        # stuff this into the string slot in the return tuple
        if line.startswith(";"):
            mlString = [line[1:]]
            for line in fileIter:
                self.__curLineNumber += 1
                if line.startswith(";"):
                    break
                mlString.append(line)
            else:
                # Input ended before the closing ';' line.
                return
            # remove trailing new-line that is part of the \n; delimiter
            mlString[-1] = mlString[-1].rstrip()
            yield (None, None, "".join(mlString), None)
            # Need to process the remainder of the closing line -
            line = line[1:]
        # Apply regex to the current line and consolidate the single/double
        # quoted matches within the quoted string slot
        for it in mmcifRe.finditer(line):
            tgroups = it.groups()
            if tgroups == (None, None, None, None, None):
                # A comment matched; it has no capturing group.
                continue
            qs = tgroups[2] if tgroups[2] is not None else tgroups[3]
            yield (tgroups[0], tgroups[1], qs, tgroups[4])
def __tokenizerOrg(self, ifh):
    """Original tokenizer for mmCIF text, yielding one 4-tuple per token.

    Each yielded tuple has the form
        (category name, attribute name, quoted string, unquoted word)
    and the slots that do not apply to the token are None.  Unlike
    __tokenizer, a single regex alternative matches both single- and
    double-quoted strings, so the opening and closing quote characters
    are not required to be the same.

    ifh - iterable of text lines (e.g. a file handle returned by open()).

    self.__curLineNumber is incremented for every line consumed.  The
    generator simply finishes when the input is exhausted, including when
    the input ends inside a semi-colon delimited text field (that
    unterminated field is not yielded).
    """
    #
    # Regex definition for mmCIF syntax - semi-colon delimited strings are handled
    # outside of this regex.  Every fragment is a raw string so that \S and \s
    # reach the regex engine untouched.
    mmcifRe = re.compile(
        r"(?:"
        r"(?:_(.+?)[.](\S+))" "|"                   # _category.attribute
        r"(?:['\"](.*?)(?:['\"]\s|['\"]$))" "|"      # quoted strings
        r"(?:\s*#.*$)" "|"                            # comments (dumped)
        r"(\S+)"                                      # unquoted words
        r")")
    fileIter = iter(ifh)
    # Iterating with 'for' (rather than calling .next()) works on Python 2 and 3
    # and ends the generator cleanly instead of leaking StopIteration.
    for line in fileIter:
        self.__curLineNumber += 1
        # Dump comments
        if line.startswith("#"):
            continue
        # Gobble up the entire semi-colon/multi-line delimited string and
        # stuff this into the string slot in the return tuple
        if line.startswith(";"):
            mlString = [line[1:]]
            for line in fileIter:
                self.__curLineNumber += 1
                if line.startswith(";"):
                    break
                mlString.append(line)
            else:
                # Input ended before the closing ';' line.
                return
            # remove trailing new-line that is part of the \n; delimiter
            mlString[-1] = mlString[-1].rstrip()
            yield (None, None, "".join(mlString), None)
            # Need to process the remainder of the closing line -
            line = line[1:]
        # Apply regex to the current line; a comment match has no groups set
        for it in mmcifRe.finditer(line):
            groups = it.groups()
            if groups != (None, None, None, None):
                yield groups
class PdbxWriter(object):
    """Write PDBx data files or dictionaries using the input container
    or container list.
    """

    def __init__(self, ofh=sys.stdout):
        """ofh - output file handle; anything with a write(str) method."""
        self.__ofh = ofh
        self.__containerList = []
        self.__MAXIMUM_LINE_LENGTH = 2048
        # Number of blanks written after every value in a loop_ row
        self.__SPACING = 2
        # Indentation applied to the content of save_ frames
        self.__INDENT_DEFINITION = 3
        self.__indentSpace = " " * self.__INDENT_DEFINITION
        self.__doDefinitionIndent = False

    def write(self, containerList):
        """Write every container in containerList, in order."""
        self.__containerList = containerList
        for container in self.__containerList:
            self.writeContainer(container)

    def writeContainer(self, container):
        """Write one container: its header line, its categories, its trailer.

        A DefinitionContainer is written as an indented save_ frame; a
        DataContainer is written as a global_ or data_ block.  Categories
        without rows are skipped, single-row categories are written as
        item-value pairs and multi-row categories as loop_ tables.

        Raises PdbxError for a category that has several rows but no
        attributes.
        """
        indS = " " * self.__INDENT_DEFINITION
        if isinstance(container, DefinitionContainer):
            self.__write("save_%s\n" % container.getName())
            self.__doDefinitionIndent = True
            self.__write(indS + "#\n")
        elif isinstance(container, DataContainer):
            if container.getGlobal():
                self.__write("global_\n")
                self.__doDefinitionIndent = False
                self.__write("\n")
            else:
                self.__write("data_%s\n" % container.getName())
                self.__doDefinitionIndent = False
                self.__write("#\n")

        for nm in container.getObjNameList():
            obj = container.getObj(nm)
            objL = obj.getRowList()
            # Skip empty objects
            if len(objL) == 0:
                continue
            # Item - value formatting
            elif len(objL) == 1:
                self.__writeItemValueFormat(obj)
            # Table formatting -
            elif len(objL) > 1 and len(obj.getAttributeList()) > 0:
                self.__writeTableFormat(obj)
            else:
                raise PdbxError()
            # Separator after each category.  It carries no newline: the
            # next category (or the trailer below) supplies it.
            if self.__doDefinitionIndent:
                self.__write(indS + "#")
            else:
                self.__write("#")

        # Add a trailing saveframe reserved word.  The leading newline keeps
        # 'save_' from being glued onto the preceding '#' separator, where
        # it would be read back as part of a comment.
        if isinstance(container, DefinitionContainer):
            self.__write("\nsave_\n")
        self.__write("#\n")

    def __write(self, st):
        """Send the string st to the output handle."""
        self.__ofh.write(st)

    def __writeItemValueFormat(self, myCategory):
        """Write the first row of myCategory as '_category.attribute value' lines."""
        # Compute the maximum item name length within this category -
        attributeNameLengthMax = 0
        for attributeName in myCategory.getAttributeList():
            attributeNameLengthMax = max(attributeNameLengthMax, len(attributeName))
        # The extra 2 accounts for the leading '_' and the '.' separator
        itemNameLengthMax = self.__SPACING + len(myCategory.getName()) + attributeNameLengthMax + 2
        #
        lineList = []
        for attributeName, iPos in myCategory.getAttributeListWithOrder():
            # Every item opens its own line
            lineList.append("\n")
            if self.__doDefinitionIndent:
                # - add indent --
                lineList.append(self.__indentSpace)
            itemName = "_%s.%s" % (myCategory.getName(), attributeName)
            lineList.append(itemName.ljust(itemNameLengthMax))
            lineList.append(myCategory.getValueFormatted(attributeName, 0))
        lineList.append("\n")
        self.__write("".join(lineList))

    def __writeTableFormat(self, myCategory):
        """Write myCategory as a loop_ declaration followed by one line per row."""
        # Write the declaration of the loop_
        #
        lineList = []
        lineList.append('\n')
        if self.__doDefinitionIndent:
            lineList.append(self.__indentSpace)
        lineList.append("loop_")
        for attributeName in myCategory.getAttributeList():
            lineList.append('\n')
            if self.__doDefinitionIndent:
                lineList.append(self.__indentSpace)
            itemName = "_%s.%s" % (myCategory.getName(), attributeName)
            lineList.append(itemName)
        self.__write("".join(lineList))
        #
        # Write the data in tabular format -
        #
        formatTypeList, dataTypeList = myCategory.getFormatTypeList()
        maxLengthList = myCategory.getAttributeValueMaxLengthList()
        spacing = " " * self.__SPACING
        #
        for iRow in range(myCategory.getRowCount()):
            lineList = []
            lineList.append('\n')
            if self.__doDefinitionIndent:
                lineList.append(self.__indentSpace + " ")
            for iAt in range(myCategory.getAttributeCount()):
                formatType = formatTypeList[iAt]
                maxLength = maxLengthList[iAt]
                if (formatType == 'FT_UNQUOTED_STRING' or formatType == 'FT_NULL_VALUE'):
                    val = myCategory.getValueFormattedByIndex(iAt, iRow)
                    lineList.append(val.ljust(maxLength))
                elif formatType == 'FT_NUMBER':
                    val = myCategory.getValueFormattedByIndex(iAt, iRow)
                    lineList.append(val.rjust(maxLength))
                elif formatType == 'FT_QUOTED_STRING':
                    val = myCategory.getValueFormattedByIndex(iAt, iRow)
                    # Leave room for the two enclosing quote characters so
                    # that the columns stay aligned.
                    lineList.append(val.ljust(maxLength + 2))
                elif formatType == "FT_MULTI_LINE_STRING":
                    val = myCategory.getValueFormattedByIndex(iAt, iRow)
                    lineList.append(val)
                lineList.append(spacing)
            self.__write("".join(lineList))
        self.__write("\n")
##
# File: PdbxReadWriteTests.py
# Author: jdw
# Date: 9-Oct-2011
# Version: 0.001
#
# Updated:
# 24-Oct-2012 jdw update path details and reorganize.
#
##
""" Various test cases for PDBx/mmCIF data file and dictionary reader and writer.
"""
__docformat__ = "restructuredtext en"
__author__ = "John Westbrook"
__email__ = "jwest@rcsb.rutgers.edu"
__license__ = "Creative Commons Attribution 3.0 Unported"
__version__ = "V0.01"
import sys, unittest, traceback
import sys, time, os, os.path, shutil
from simtk.openmm.app.internal.pdbx.reader.PdbxReader import PdbxReader
from simtk.openmm.app.internal.pdbx.writer.PdbxWriter import PdbxWriter
from simtk.openmm.app.internal.pdbx.reader.PdbxContainers import *
class PdbxReadWriteTests(unittest.TestCase):
    """Write/read round-trip tests for PdbxWriter and PdbxReader.

    Output files are created in the current working directory; the input
    example file is looked up relative to it (see setUp).
    """

    def setUp(self):
        self.lfh = sys.stdout
        self.verbose = False
        self.pathPdbxDataFile = "../tests/1kip.cif"
        self.pathOutputFile = "testOutputDataFile.cif"

    def tearDown(self):
        pass

    # ---- helpers -------------------------------------------------------

    def _logStart(self):
        """Log the class name and the name of the running test method."""
        self.lfh.write("\nStarting %s %s\n" % (self.__class__.__name__,
                                               self.id().split(".")[-1]))

    def _failWithTraceback(self):
        """Print the active exception to the log handle and fail the test."""
        traceback.print_exc(file=self.lfh)
        self.fail()

    def _writeContainers(self, path, containerList):
        """Write containerList to path; the file is closed even on error."""
        with open(path, "w") as ofh:
            PdbxWriter(ofh).write(containerList)

    def _readContainers(self, path):
        """Return the list of containers parsed from path."""
        containerList = []
        with open(path, "r") as ifh:
            PdbxReader(ifh).read(containerList)
        return containerList

    def _buildMappingCategory(self, rowList):
        """Return a 'pdbx_seqtool_mapping_ref' category holding rowList."""
        aCat = DataCategory("pdbx_seqtool_mapping_ref")
        for attributeName in ("ordinal", "entity_id", "auth_mon_id", "auth_mon_num",
                              "pdb_chain_id", "ref_mon_id", "ref_mon_num"):
            aCat.appendAttribute(attributeName)
        for row in rowList:
            aCat.append(row)
        return aCat

    # ---- test cases ----------------------------------------------------

    def testSimpleInitialization(self):
        """Test case - Simple initialization of a data category and data block
        """
        self._logStart()
        try:
            fn = "test-simple.cif"
            attributeNameList = ['aOne', 'aTwo', 'aThree', 'aFour', 'aFive',
                                 'aSix', 'aSeven', 'aEight', 'aNine', 'aTen']
            # Ten independent rows holding the values 1..10
            rowList = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] for _ in range(10)]
            nameCat = 'myCategory'
            #
            curContainer = DataContainer("myblock")
            aCat = DataCategory(nameCat, attributeNameList, rowList)
            aCat.printIt()
            curContainer.append(aCat)
            curContainer.printIt()
            #
            self._writeContainers(fn, [curContainer])
            for container in self._readContainers(fn):
                for objName in container.getObjNameList():
                    name, aList, rList = container.getObj(objName).get()
                    self.lfh.write("Recovered data category %s\n" % name)
                    self.lfh.write("Attribute list %r\n" % repr(aList))
                    self.lfh.write("Row list %r\n" % repr(rList))
        except Exception:
            self._failWithTraceback()

    def testWriteDataFile(self):
        """Test case - write data file
        """
        self._logStart()
        try:
            curContainer = DataContainer("myblock")
            aCat = self._buildMappingCategory([[1, 2, 3, 4, 5, 6, 7],
                                               [1, 2, 3, 4, 5, 6, 7],
                                               [1, 2, 3, 4, 5, 6, 7],
                                               [1, 2, 3, 4, 5, 6, 7],
                                               [7, 6, 5, 4, 3, 2, 1]])
            aCat.printIt()
            curContainer.append(aCat)
            curContainer.printIt()
            #
            self._writeContainers("test-output.cif", [curContainer])
        except Exception:
            self._failWithTraceback()

    def testUpdateDataFile(self):
        """Test case - update data file
        """
        self._logStart()
        try:
            # Create an initial data file --
            #
            curContainer = DataContainer("myblock")
            aCat = self._buildMappingCategory([[9, 2, 3, 4, 5, 6, 7],
                                               [10, 2, 3, 4, 5, 6, 7],
                                               [11, 2, 3, 4, 5, 6, 7],
                                               [12, 2, 3, 4, 5, 6, 7]])
            curContainer.append(aCat)
            self._writeContainers("test-output-1.cif", [curContainer])
            #
            # Read and update the data -
            #
            myDataList = self._readContainers("test-output-1.cif")
            myBlock = myDataList[0]
            myBlock.printIt()
            myCat = myBlock.getObj('pdbx_seqtool_mapping_ref')
            myCat.printIt()
            # range() instead of xrange() keeps this runnable on Python 3
            for iRow in range(myCat.getRowCount()):
                myCat.setValue('some value', 'ref_mon_id', iRow)
                myCat.setValue(100, 'ref_mon_num', iRow)
            self._writeContainers("test-output-2.cif", myDataList)
        except Exception:
            self._failWithTraceback()

    def testReadDataFile(self):
        """Test case - read data file
        """
        self._logStart()
        try:
            self._readContainers(self.pathPdbxDataFile)
        except Exception:
            self._failWithTraceback()

    def testReadWriteDataFile(self):
        """Test case - data file read write test
        """
        self._logStart()
        try:
            myDataList = self._readContainers(self.pathPdbxDataFile)
            self._writeContainers(self.pathOutputFile, myDataList)
        except Exception:
            self._failWithTraceback()
def simpleSuite():
    """Return a TestSuite holding the selected PdbxReadWriteTests cases."""
    selectedNames = (
        "testSimpleInitialization",
        "testUpdateDataFile",
        "testReadWriteDataFile",
    )
    suiteSelect = unittest.TestSuite()
    for testName in selectedNames:
        suiteSelect.addTest(PdbxReadWriteTests(testName))
    return suiteSelect
if __name__ == '__main__':
    # Run the selected cases with one line of output per test.
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(simpleSuite())
#
##
# File: PdbxReader.py
# Date: 2012-01-09 Jdw Adapted from PdbxParser
#
# Updates:
#
# 2012-01-09 - (jdw) Separate reader and writer classes.
#
# 2012-09-02 - (jdw) Revise tokenizer to better handle embedded quoting.
#
##
"""
PDBx/mmCIF dictionary and data file parser.
Acknowledgements:
The tokenizer used in this module is modeled after the clever parser design
used in the PyMMLIB package.
PyMMLib Development Group
Authors: Ethan Merritt: merritt@u.washington.edu & Jay Painter: jay.painter@gmail.com
See: http://pymmlib.sourceforge.net/
"""
import re,sys
from simtk.openmm.app.internal.pdbx.reader.PdbxContainers import *
class PdbxError(Exception):
    """General-purpose exception class for errors in this module."""
class SyntaxError(Exception):
    """Syntax error carrying the input line number and a description.

    Note that inside this module the name hides the built-in SyntaxError.
    """

    def __init__(self, lineNumber, text):
        # The base class is initialised without arguments; the details
        # are kept in the two attributes below.
        super(SyntaxError, self).__init__()
        self.lineNumber, self.text = lineNumber, text

    def __str__(self):
        template = "%%ERROR - [at line: %d] %s"
        return template % (self.lineNumber, self.text)
class PdbxReader(object):
    """ PDBx reader for data files and dictionaries.
    """

    def __init__(self, ifh):
        """ ifh - input file handle returned by open()
        """
        #
        self.__curLineNumber = 0
        self.__ifh = ifh
        # Reserved-word prefix (the text before the first '_') -> parser state
        self.__stateDict = {"data": "ST_DATA_CONTAINER",
                            "loop": "ST_TABLE",
                            "global": "ST_GLOBAL_CONTAINER",
                            "save": "ST_DEFINITION",
                            "stop": "ST_STOP"}

    def read(self, containerList):
        """
        Appends to the input list of definition and data containers.

        Parsing normally ends with StopIteration when the tokenizer runs
        out of input; that is swallowed here.  If the parser returns
        before the input is exhausted (it does so on a stop_ token),
        PdbxError is raised.  Syntax problems raise this module's
        SyntaxError.
        """
        self.__curLineNumber = 0
        try:
            self.__parser(self.__tokenizer(self.__ifh), containerList)
        except StopIteration:
            pass
        else:
            raise PdbxError()

    def __syntaxError(self, errText):
        """Raise SyntaxError for the current line; never returns."""
        raise SyntaxError(self.__curLineNumber, errText)

    def __getContainerName(self, inWord):
        """ Returns the name of the data_ or save_ container
        """
        # Both 'data_' and 'save_' are five characters long.
        return str(inWord[5:]).strip()

    def __getState(self, inWord):
        """Identifies reserved syntax elements and assigns an associated state.

        Returns: (reserved word, state)
        where -
           reserved word - is one of CIF syntax elements:
                           data_, loop_, global_, save_, stop_
           state - the parser state required to process this next section.

        (None, "ST_UNKNOWN") is returned when inWord has no '_' or when the
        text before its first '_' is not a reserved word.
        """
        i = inWord.find("_")
        if i == -1:
            return None, "ST_UNKNOWN"
        rWord = inWord[:i].lower()
        try:
            return rWord, self.__stateDict[rWord]
        except KeyError:
            return None, "ST_UNKNOWN"

    def __parser(self, tokenizer, containerList):
        """ Parser for PDBx data files and dictionaries.

        Input - tokenizer() reentrant method recognizing data item names (_category.attribute)
                quoted strings (single, double and multi-line semi-colon delimited), and unquoted
                strings.

                containerList - list-type container for data and definition objects parsed from
                                the input file.

        Return:
                containerList - is appended with data and definition objects -

        StopIteration from the tokenizer is deliberately not caught here;
        read() treats it as the normal end of input.
        """
        # Working container - data or definition
        curContainer = None
        #
        # Working category container
        categoryIndex = {}
        curCategory = None
        #
        curRow = None
        state = None

        # Find the first reserved word and begin capturing data.
        #
        while True:
            curCatName, curAttName, curQuotedString, curWord = next(tokenizer)
            if curWord is None:
                continue
            reservedWord, state = self.__getState(curWord)
            if reservedWord is not None:
                break

        while True:
            #
            #  Set the current state  -
            #
            #  At this point in the processing cycle we are expecting a token containing
            #  either a '_category.attribute'  or a reserved word.
            #
            if curCatName is not None:
                state = "ST_KEY_VALUE_PAIR"
            elif curWord is not None:
                reservedWord, state = self.__getState(curWord)
            else:
                self.__syntaxError("Miscellaneous syntax error")
                return

            #
            # Process  _category.attribute  value assignments
            #
            if state == "ST_KEY_VALUE_PAIR":
                try:
                    curCategory = categoryIndex[curCatName]
                except KeyError:
                    # A new category is encountered - create a container and add a row
                    curCategory = categoryIndex[curCatName] = DataCategory(curCatName)
                    try:
                        curContainer.append(curCategory)
                    except AttributeError:
                        # curContainer is still None: no data_/save_ seen yet
                        self.__syntaxError("Category cannot be added to  data_ block")
                        return
                    curRow = []
                    curCategory.append(curRow)
                else:
                    # Recover the existing row from the category
                    try:
                        curRow = curCategory[0]
                    except IndexError:
                        self.__syntaxError("Internal index error accessing category data")
                        return

                # Check for duplicate attributes and add attribute to table.
                if curAttName in curCategory.getAttributeList():
                    self.__syntaxError("Duplicate attribute encountered in category")
                    return
                else:
                    curCategory.appendAttribute(curAttName)

                # Get the data for this attribute from the next token
                tCat, tAtt, curQuotedString, curWord = next(tokenizer)

                if tCat is not None or (curQuotedString is None and curWord is None):
                    self.__syntaxError("Missing data for item _%s.%s" % (curCatName,curAttName))

                if curWord is not None:
                    #
                    # Validation check token for misplaced reserved words  -
                    #
                    reservedWord, state = self.__getState(curWord)
                    if reservedWord is not None:
                        self.__syntaxError("Unexpected reserved word: %s" % (reservedWord))
                    curRow.append(curWord)
                elif curQuotedString is not None:
                    curRow.append(curQuotedString)
                else:
                    self.__syntaxError("Missing value in item-value pair")

                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)
                continue

            #
            # Process a loop_ declaration and associated data -
            #
            elif state == "ST_TABLE":
                # The category name in the next curCatName,curAttName pair
                #    defines the name of the category container.
                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

                if curCatName is None or curAttName is None:
                    self.__syntaxError("Unexpected token in loop_ declaration")
                    return

                # Check for a previous category declaration.
                if curCatName in categoryIndex:
                    self.__syntaxError("Duplicate category declaration in loop_")
                    return

                curCategory = DataCategory(curCatName)

                try:
                    curContainer.append(curCategory)
                except AttributeError:
                    self.__syntaxError("loop_ declaration outside of data_ block or save_ frame")
                    return

                curCategory.appendAttribute(curAttName)

                # Read the rest of the loop_ declaration
                while True:
                    curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

                    if curCatName is None:
                        break

                    if curCatName != curCategory.getName():
                        self.__syntaxError("Changed category name in loop_ declaration")
                        return

                    curCategory.appendAttribute(curAttName)

                # If the next token is a 'word', check it for any reserved words -
                if curWord is not None:
                    reservedWord, state = self.__getState(curWord)
                    if reservedWord is not None:
                        if reservedWord == "stop":
                            return
                        else:
                            self.__syntaxError("Unexpected reserved word after loop declaration: %s" % (reservedWord))

                # Read the table of data for this loop_ -
                while True:
                    curRow = []
                    curCategory.append(curRow)

                    # One value per declared attribute; the token in hand is
                    # stored before the next one is fetched.
                    for tAtt in curCategory.getAttributeList():
                        if curWord is not None:
                            curRow.append(curWord)
                        elif curQuotedString is not None:
                            curRow.append(curQuotedString)

                        curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

                    # loop_ data processing ends if -

                    # A new _category.attribute is encountered
                    if curCatName is not None:
                        break

                    # A reserved word is encountered
                    if curWord is not None:
                        reservedWord, state = self.__getState(curWord)
                        if reservedWord is not None:
                            break

                continue

            elif state == "ST_DEFINITION":
                # Ignore trailing unnamed saveframe delimiters e.g. 'save_'
                sName = self.__getContainerName(curWord)
                if (len(sName) > 0):
                    curContainer = DefinitionContainer(sName)
                    containerList.append(curContainer)
                    categoryIndex = {}
                    curCategory = None

                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

            elif state == "ST_DATA_CONTAINER":
                #
                dName = self.__getContainerName(curWord)
                if len(dName) == 0:
                    dName = "unidentified"
                curContainer = DataContainer(dName)
                containerList.append(curContainer)
                categoryIndex = {}
                curCategory = None
                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

            elif state == "ST_STOP":
                return

            # Must match the state name registered for 'global' in
            # self.__stateDict; otherwise no branch consumes the global_
            # token and this loop never advances.
            elif state == "ST_GLOBAL_CONTAINER":
                curContainer = DataContainer("blank-global")
                curContainer.setGlobal()
                containerList.append(curContainer)
                categoryIndex = {}
                curCategory = None
                curCatName, curAttName, curQuotedString, curWord = next(tokenizer)

            elif state == "ST_UNKNOWN":
                self.__syntaxError("Unrecogized syntax element: " + str(curWord))
                return

    def __tokenizer(self, ifh):
        """ Tokenizer method for the mmCIF syntax file -

        Each return/yield from this method returns information about
        the next token in the form of a tuple with the following structure.

        (category name, attribute name, quoted strings, words w/o quotes or white space)

        Differentiated the regular expression to better handle embedded quotes.

        The generator simply finishes when the input is exhausted, also when
        the input ends inside a semi-colon delimited text field (that
        unterminated field is not yielded).
        """
        #
        # Regex definition for mmCIF syntax - semi-colon delimited strings are handled
        # outside of this regex.  Every fragment is a raw string so that \S and \s
        # reach the regex engine untouched.
        mmcifRe = re.compile(
            r"(?:"
            r"(?:_(.+?)[.](\S+))" "|"                # _category.attribute
            r"(?:['](.*?)(?:[']\s|[']$))" "|"         # single quoted strings
            r"(?:[\"](.*?)(?:[\"]\s|[\"]$))" "|"      # double quoted strings
            r"(?:\s*#.*$)" "|"                         # comments (dumped)
            r"(\S+)"                                   # unquoted words
            r")")

        fileIter = iter(ifh)

        ## Tokenizer loop begins here ---
        # Iterating with 'for' (rather than calling .next()) works on Python 2
        # and 3 and ends the generator cleanly instead of leaking StopIteration.
        for line in fileIter:
            self.__curLineNumber += 1

            # Dump comments
            if line.startswith("#"):
                continue

            # Gobble up the entire semi-colon/multi-line delimited string and
            # stuff this into the string slot in the return tuple
            #
            if line.startswith(";"):
                mlString = [line[1:]]
                for line in fileIter:
                    self.__curLineNumber += 1
                    if line.startswith(";"):
                        break
                    mlString.append(line)
                else:
                    # Input ended before the closing ';' line.
                    return

                # remove trailing new-line that is part of the \n; delimiter
                mlString[-1] = mlString[-1].rstrip()
                #
                yield (None, None, "".join(mlString), None)
                #
                # Need to process the remainder of the current line -
                line = line[1:]

            # Apply regex to the current line and consolidate the single/double
            # quoted matches within the quoted string slot
            for it in mmcifRe.finditer(line):
                tgroups = it.groups()
                if tgroups == (None, None, None, None, None):
                    # A comment matched; it has no capturing group.
                    continue
                qs = tgroups[2] if tgroups[2] is not None else tgroups[3]
                yield (tgroups[0], tgroups[1], qs, tgroups[4])

    def __tokenizerOrg(self, ifh):
        """ Tokenizer method for the mmCIF syntax file -

        Each return/yield from this method returns information about
        the next token in the form of a tuple with the following structure.

        (category name, attribute name, quoted strings, words w/o quotes or white space)

        Original variant: one regex alternative matches both single- and
        double-quoted strings.
        """
        #
        # Regex definition for mmCIF syntax - semi-colon delimited strings are handled
        # outside of this regex.  Every fragment is a raw string so that \S and \s
        # reach the regex engine untouched.
        mmcifRe = re.compile(
            r"(?:"
            r"(?:_(.+?)[.](\S+))" "|"                   # _category.attribute
            r"(?:['\"](.*?)(?:['\"]\s|['\"]$))" "|"      # quoted strings
            r"(?:\s*#.*$)" "|"                            # comments (dumped)
            r"(\S+)"                                      # unquoted words
            r")")

        fileIter = iter(ifh)

        ## Tokenizer loop begins here ---
        for line in fileIter:
            self.__curLineNumber += 1

            # Dump comments
            if line.startswith("#"):
                continue

            # Gobble up the entire semi-colon/multi-line delimited string and
            # stuff this into the string slot in the return tuple
            #
            if line.startswith(";"):
                mlString = [line[1:]]
                for line in fileIter:
                    self.__curLineNumber += 1
                    if line.startswith(";"):
                        break
                    mlString.append(line)
                else:
                    # Input ended before the closing ';' line.
                    return

                # remove trailing new-line that is part of the \n; delimiter
                mlString[-1] = mlString[-1].rstrip()
                #
                yield (None, None, "".join(mlString), None)
                #
                # Need to process the remainder of the current line -
                line = line[1:]

            ## Apply regex to the current line; a comment match has no groups set
            for it in mmcifRe.finditer(line):
                groups = it.groups()
                if groups != (None, None, None, None):
                    yield groups
##
# File: PdbxReaderTests.py
# Author: jdw
# Date: 9-Jan-2012
# Version: 0.001
#
# Update:
# 27-Sep-2012 jdw add test case for reading PDBx structure factor file
#
##
"""
Test cases for reading PDBx/mmCIF data files PdbxReader class -
"""
import sys, unittest, traceback
import sys, time, os, os.path, shutil
from simtk.openmm.app.internal.pdbx.reader.PdbxReader import PdbxReader
from simtk.openmm.app.internal.pdbx.reader.PdbxContainers import *
class PdbxReaderTests(unittest.TestCase):
    """Read tests for PdbxReader using the example files named in setUp."""

    def setUp(self):
        self.lfh = sys.stderr
        self.verbose = False
        self.pathPdbxDataFile = "../tests/1kip.cif"
        self.pathBigPdbxDataFile = "../tests/1ffk.cif"
        self.pathSFDataFile = "../tests/1kip-sf.cif"

    def tearDown(self):
        pass

    # ---- helpers -------------------------------------------------------

    def _logStart(self):
        """Log the class name and the name of the running test method."""
        self.lfh.write("\nStarting %s %s\n" % (self.__class__.__name__,
                                               self.id().split(".")[-1]))

    def _readContainers(self, path):
        """Return the list of containers parsed from path.

        The file is closed even when the reader raises.
        """
        containerList = []
        with open(path, "r") as ifh:
            PdbxReader(ifh).read(containerList)
        return containerList

    # ---- test cases ----------------------------------------------------

    def testReadSmallDataFile(self):
        """Test case - read data file
        """
        self._logStart()
        try:
            self._readContainers(self.pathPdbxDataFile)
        except Exception:
            traceback.print_exc(file=sys.stderr)
            self.fail()

    def testReadBigDataFile(self):
        """Test case - read data file
        """
        self._logStart()
        try:
            self._readContainers(self.pathBigPdbxDataFile)
        except Exception:
            traceback.print_exc(file=sys.stderr)
            self.fail()

    def testReadSFDataFile(self):
        """Test case - read PDB structure factor data file and compute statistics on f/sig(f).
        """
        self._logStart()
        try:
            myContainerList = self._readContainers(self.pathSFDataFile)
            c0 = myContainerList[0]
            #
            catObj = c0.getObj("refln")
            if catObj is None:
                # The original 'return false' raised NameError here, which
                # ended in a failure as well; make that outcome explicit.
                self.fail("No 'refln' category in %s" % self.pathSFDataFile)
            #
            # Get column name index.
            #
            itDict = {}
            for idxIt, itName in enumerate(catObj.getItemNameList()):
                itDict[str(itName).lower()] = idxIt
            #
            idf = itDict['_refln.f_meas_au']
            idsigf = itDict['_refln.f_meas_sigma_au']
            minR = 100
            maxR = -1
            sumR = 0
            icount = 0
            for row in catObj.getRowList():
                try:
                    f = float(row[idf])
                    sigf = float(row[idsigf])
                    # NOTE(review): this is sig(f)/f although the docstring
                    # and the log line below say f/sig(f) - confirm which
                    # one is intended.
                    ratio = sigf / f
                except (TypeError, ValueError, ZeroDivisionError, IndexError):
                    # Skip rows whose values are missing, non-numeric or zero
                    continue
                maxR = max(maxR, ratio)
                minR = min(minR, ratio)
                sumR += ratio
                icount += 1
            # Without this check the average below would divide by zero
            self.assertTrue(icount > 0, "No usable refln rows found")
            self.lfh.write("f/sig(f) min %f max %f avg %f count %d\n" % (minR, maxR, sumR/icount,icount))
        except self.failureException:
            # Keep the explicit failure messages raised above intact
            raise
        except Exception:
            traceback.print_exc(file=sys.stderr)
            self.fail()
def simpleSuite():
    """Return a TestSuite with the three PdbxReaderTests cases, big file first."""
    orderedNames = (
        "testReadBigDataFile",
        "testReadSmallDataFile",
        "testReadSFDataFile",
    )
    suiteSelect = unittest.TestSuite()
    for testName in orderedNames:
        suiteSelect.addTest(PdbxReaderTests(testName))
    return suiteSelect
if __name__ == '__main__':
    # Run the selected cases with one line of output per test.
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(simpleSuite())
#
##
# File: PdbxWriter.py
# Date: 2011-10-09 Jdw Adapted from PdbxParser.py
#
# Updates:
# 5-Apr-2011 jdw Using the double quote format preference
# 23-Oct-2012 jdw update path details and reorganize.
#
###
"""
Classes for writing data and dictionary containers in PDBx/mmCIF format.
"""
__docformat__ = "restructuredtext en"
__author__ = "John Westbrook"
__email__ = "jwest@rcsb.rutgers.edu"
__license__ = "Creative Commons Attribution 3.0 Unported"
__version__ = "V0.01"
from simtk.openmm.app.internal.pdbx.reader.PdbxContainers import *
class PdbxError(Exception):
    """General-purpose exception class for errors raised by the writer."""
class PdbxWriter(object):
    """Write PDBx data files or dictionaries using the input container
    or container list.
    """

    # NOTE(review): 'sys' is used for the default output handle but is not
    # imported explicitly in this module; presumably it arrives through the
    # star import of PdbxContainers - confirm.
    def __init__(self, ofh=sys.stdout):
        """ofh - output file handle; anything with a write(str) method."""
        self.__ofh = ofh
        self.__containerList = []
        self.__MAXIMUM_LINE_LENGTH = 2048
        # Number of blanks written after every value in a loop_ row
        self.__SPACING = 2
        # Indentation applied to the content of save_ frames
        self.__INDENT_DEFINITION = 3
        self.__indentSpace = " " * self.__INDENT_DEFINITION
        self.__doDefinitionIndent = False
        # Maximum number of rows checked for value length and format
        self.__rowPartition = None

    def setRowPartition(self, numRows):
        ''' Maximum number of rows checked for value length and format
        '''
        self.__rowPartition = numRows

    def write(self, containerList):
        """Write every container in containerList, in order."""
        self.__containerList = containerList
        for container in self.__containerList:
            self.writeContainer(container)

    def writeContainer(self, container):
        """Write one container: its header line, its categories, its trailer.

        A DefinitionContainer is written as an indented save_ frame; a
        DataContainer is written as a global_ or data_ block.  Categories
        without rows are skipped, single-row categories are written as
        item-value pairs and multi-row categories as loop_ tables.

        Raises PdbxError for a category that has several rows but no
        attributes.
        """
        indS = " " * self.__INDENT_DEFINITION
        if isinstance(container, DefinitionContainer):
            self.__write("save_%s\n" % container.getName())
            self.__doDefinitionIndent = True
            self.__write(indS + "#\n")
        elif isinstance(container, DataContainer):
            if container.getGlobal():
                self.__write("global_\n")
                self.__doDefinitionIndent = False
                self.__write("\n")
            else:
                self.__write("data_%s\n" % container.getName())
                self.__doDefinitionIndent = False
                self.__write("#\n")

        for nm in container.getObjNameList():
            obj = container.getObj(nm)
            objL = obj.getRowList()
            # Skip empty objects
            if len(objL) == 0:
                continue
            # Item - value formatting
            elif len(objL) == 1:
                self.__writeItemValueFormat(obj)
            # Table formatting -
            elif len(objL) > 1 and len(obj.getAttributeList()) > 0:
                self.__writeTableFormat(obj)
            else:
                raise PdbxError()
            # Separator after each category; written without a newline
            if self.__doDefinitionIndent:
                self.__write(indS + "#")
            else:
                self.__write("#")

        # Add a trailing saveframe reserved word
        if isinstance(container, DefinitionContainer):
            self.__write("\nsave_\n")
        self.__write("#\n")

    def __write(self, st):
        """Send the string st to the output handle."""
        self.__ofh.write(st)

    def __writeItemValueFormat(self, myCategory):
        """Write the first row of myCategory as '_category.attribute value' lines."""
        # Compute the maximum item name length within this category -
        attributeNameLengthMax = 0
        for attributeName in myCategory.getAttributeList():
            attributeNameLengthMax = max(attributeNameLengthMax, len(attributeName))
        # The extra 2 accounts for the leading '_' and the '.' separator
        itemNameLengthMax = self.__SPACING + len(myCategory.getName()) + attributeNameLengthMax + 2
        #
        lineList = []
        lineList.append("#\n")
        for attributeName, iPos in myCategory.getAttributeListWithOrder():
            if self.__doDefinitionIndent:
                # - add indent --
                lineList.append(self.__indentSpace)
            itemName = "_%s.%s" % (myCategory.getName(), attributeName)
            lineList.append(itemName.ljust(itemNameLengthMax))
            lineList.append(myCategory.getValueFormatted(attributeName, 0))
            lineList.append("\n")
        self.__write("".join(lineList))

    def __writeTableFormat(self, myCategory):
        """Write myCategory as a loop_ declaration followed by one line per row."""
        # Write the declaration of the loop_
        #
        lineList = []
        lineList.append('#\n')
        if self.__doDefinitionIndent:
            lineList.append(self.__indentSpace)
        lineList.append("loop_")
        for attributeName in myCategory.getAttributeList():
            lineList.append('\n')
            if self.__doDefinitionIndent:
                lineList.append(self.__indentSpace)
            itemName = "_%s.%s" % (myCategory.getName(), attributeName)
            lineList.append(itemName)
        self.__write("".join(lineList))
        #
        # Write the data in tabular format -
        #
        # For speed make the following evaluation on a portion of the table.
        # Floor division keeps numSteps an integer under true division as
        # well; a partition of None or 0 means "no partitioning" instead of
        # raising ZeroDivisionError.
        if self.__rowPartition:
            numSteps = max(1, myCategory.getRowCount() // self.__rowPartition)
        else:
            numSteps = 1
        formatTypeList, dataTypeList = myCategory.getFormatTypeList(steps=numSteps)
        maxLengthList = myCategory.getAttributeValueMaxLengthList(steps=numSteps)
        spacing = " " * self.__SPACING
        #
        for iRow in range(myCategory.getRowCount()):
            lineList = []
            lineList.append('\n')
            if self.__doDefinitionIndent:
                lineList.append(self.__indentSpace + " ")
            for iAt in range(myCategory.getAttributeCount()):
                formatType = formatTypeList[iAt]
                maxLength = maxLengthList[iAt]
                if (formatType == 'FT_UNQUOTED_STRING' or formatType == 'FT_NULL_VALUE'):
                    val = myCategory.getValueFormattedByIndex(iAt, iRow)
                    lineList.append(val.ljust(maxLength))
                elif formatType == 'FT_NUMBER':
                    val = myCategory.getValueFormattedByIndex(iAt, iRow)
                    lineList.append(val.rjust(maxLength))
                elif formatType == 'FT_QUOTED_STRING':
                    val = myCategory.getValueFormattedByIndex(iAt, iRow)
                    # +2 leaves room for the two enclosing quote characters
                    lineList.append(val.ljust(maxLength + 2))
                elif formatType == "FT_MULTI_LINE_STRING":
                    val = myCategory.getValueFormattedByIndex(iAt, iRow)
                    lineList.append(val)
                lineList.append(spacing)
            self.__write("".join(lineList))
        self.__write("\n")
##
# File: PdbxWriterTests.py
# Author: jdw
# Date: 3-November-2009
# Version: 0.001
#
# Update:
# 5-Apr-2011 jdw Using the double quote format preference
# 24-Oct-2012 jdw Update path and examples.
##
"""
Test implementing PDBx/mmCIF write and formatting operations.
"""
__docformat__ = "restructuredtext en"
__author__ = "John Westbrook"
__email__ = "jwest@rcsb.rutgers.edu"
__license__ = "Creative Commons Attribution 3.0 Unported"
__version__ = "V0.01"
import sys, unittest, traceback
import sys, time, os, os.path, shutil
from simtk.openmm.app.internal.pdbx.reader.PdbxReader import PdbxReader
from simtk.openmm.app.internal.pdbx.writer.PdbxWriter import PdbxWriter
from simtk.openmm.app.internal.pdbx.reader.PdbxContainers import *
class PdbxWriterTests(unittest.TestCase):
def setUp(self):
self.lfh=sys.stderr
self.verbose=False
self.pathPdbxDataFile ="../tests/1kip.cif"
self.pathOutputFile ="testOutputDataFile.cif"
def tearDown(self):
pass
def testWriteDataFile(self):
"""Test case - write data file
"""
self.lfh.write("\nStarting %s %s\n" % (self.__class__.__name__,
sys._getframe().f_code.co_name))
try:
#
myDataList=[]
ofh = open("test-output.cif", "w")
curContainer=DataContainer("myblock")
aCat=DataCategory("pdbx_seqtool_mapping_ref")
aCat.appendAttribute("ordinal")
aCat.appendAttribute("entity_id")
aCat.appendAttribute("auth_mon_id")
aCat.appendAttribute("auth_mon_num")
aCat.appendAttribute("pdb_chain_id")
aCat.appendAttribute("ref_mon_id")
aCat.appendAttribute("ref_mon_num")
aCat.append((1,2,3,4,5,6,7))
aCat.append((1,2,3,4,5,6,7))
aCat.append((1,2,3,4,5,6,7))
aCat.append((1,2,3,4,5,6,7))
curContainer.append(aCat)
myDataList.append(curContainer)
pdbxW=PdbxWriter(ofh)
pdbxW.write(myDataList)
ofh.close()
except:
traceback.print_exc(file=sys.stderr)
self.fail()
def testUpdateDataFile(self):
"""Test case - write data file
"""
self.lfh.write("\nStarting %s %s\n" % (self.__class__.__name__,
sys._getframe().f_code.co_name))
try:
# Create a initial data file --
#
myDataList=[]
ofh = open("test-output-1.cif", "w")
curContainer=DataContainer("myblock")
aCat=DataCategory("pdbx_seqtool_mapping_ref")
aCat.appendAttribute("ordinal")
aCat.appendAttribute("entity_id")
aCat.appendAttribute("auth_mon_id")
aCat.appendAttribute("auth_mon_num")
aCat.appendAttribute("pdb_chain_id")
aCat.appendAttribute("ref_mon_id")
aCat.appendAttribute("ref_mon_num")
aCat.append((1,2,3,4,5,6,7))
aCat.append((1,2,3,4,5,6,7))
aCat.append((1,2,3,4,5,6,7))
aCat.append((1,2,3,4,5,6,7))
curContainer.append(aCat)
myDataList.append(curContainer)
pdbxW=PdbxWriter(ofh)
pdbxW.write(myDataList)
ofh.close()
#
# Read and update the data -
#
myDataList=[]
ifh = open("test-output-1.cif", "r")
pRd=PdbxReader(ifh)
pRd.read(myDataList)
ifh.close()
#
myBlock=myDataList[0]
myBlock.printIt()
myCat=myBlock.getObj('pdbx_seqtool_mapping_ref')
myCat.printIt()
for iRow in xrange(0,myCat.getRowCount()):
myCat.setValue('some value', 'ref_mon_id',iRow)
myCat.setValue(100, 'ref_mon_num',iRow)
ofh = open("test-output-2.cif", "w")
pdbxW=PdbxWriter(ofh)
pdbxW.write(myDataList)
ofh.close()
except:
traceback.print_exc(file=sys.stderr)
self.fail()
def testReadDataFile(self):
"""Test case - read data file
"""
self.lfh.write("\nStarting %s %s\n" % (self.__class__.__name__,
sys._getframe().f_code.co_name))
try:
#
myDataList=[]
ifh = open(self.pathPdbxDataFile, "r")
pRd=PdbxReader(ifh)
pRd.read(myDataList)
ifh.close()
except:
traceback.print_exc(file=sys.stderr)
self.fail()
def testReadWriteDataFile(self):
"""Test case - data file read write test
"""
self.lfh.write("\nStarting %s %s\n" % (self.__class__.__name__,
sys._getframe().f_code.co_name))
try:
#
myDataList=[]
ifh = open(self.pathPdbxDataFile, "r")
pRd=PdbxReader(ifh)
pRd.read(myDataList)
ifh.close()
ofh = open(self.pathOutputFile, "w")
pWr=PdbxWriter(ofh)
pWr.write(myDataList)
ofh.close()
except:
traceback.print_exc(file=sys.stderr)
self.fail()
def suite():
    """Return a TestSuite containing every ``test*`` method of PdbxWriterTests."""
    # unittest.makeSuite() was deprecated in Python 3.11 and removed in 3.13.
    # The loader's default method prefix is 'test', matching the prefix the
    # original makeSuite() call passed explicitly.
    return unittest.TestLoader().loadTestsFromTestCase(PdbxWriterTests)


if __name__ == '__main__':
    unittest.main()
"""
pdbxfile.py: Used for loading PDBx/mmCIF files.
This is part of the OpenMM molecular simulation toolkit originating from
Simbios, the NIH National Center for Physics-Based Simulation of
Biological Structures at Stanford, funded under the NIH Roadmap for
Medical Research, grant U54 GM072970. See https://simtk.org.
Portions copyright (c) 2014 Stanford University and the Authors.
Authors: Peter Eastman
Contributors:
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
__author__ = "Peter Eastman"
__version__ = "1.0"
import os
import sys
from simtk.openmm import Vec3
from simtk.openmm.app.internal.pdbx.reader.PdbxReader import PdbxReader
from simtk.openmm.app import Topology
from simtk.unit import nanometers, angstroms, is_quantity, norm, Quantity
import element as elem
try:
import numpy
except:
pass
class PDBxFile(object):
    """PDBxFile parses a PDBx/mmCIF file and constructs a Topology and a set of atom positions from it."""

    def __init__(self, file):
        """Load a PDBx/mmCIF file.

        The atom positions and Topology can be retrieved by calling getPositions() and getTopology().

        Parameters:
         - file (string) the name of the file to load.  Alternatively you can pass an open file object.

        Raises ValueError if a later model references an atom that the first model
        did not define, or lists its atoms in a different order than the first model.
        """
        top = Topology()
        ## The Topology read from the PDBx/mmCIF file
        self.topology = top
        self._positions = []

        # Load the file.  Only close the handle if it was opened here; a file
        # object supplied by the caller remains the caller's responsibility.

        openedHere = isinstance(file, str)
        inputFile = open(file) if openedHere else file
        try:
            reader = PdbxReader(inputFile)
            data = []
            reader.read(data)
        finally:
            if openedHere:
                inputFile.close()
        block = data[0]

        # Build the topology.

        atomData = block.getObj('atom_site')
        atomNameCol = atomData.getAttributeIndex('label_atom_id')
        atomIdCol = atomData.getAttributeIndex('id')
        resNameCol = atomData.getAttributeIndex('label_comp_id')
        resIdCol = atomData.getAttributeIndex('label_seq_id')
        asymIdCol = atomData.getAttributeIndex('label_asym_id')
        chainIdCol = atomData.getAttributeIndex('label_entity_id')
        elementCol = atomData.getAttributeIndex('type_symbol')
        modelCol = atomData.getAttributeIndex('pdbx_PDB_model_num')
        xCol = atomData.getAttributeIndex('Cartn_x')
        yCol = atomData.getAttributeIndex('Cartn_y')
        zCol = atomData.getAttributeIndex('Cartn_z')
        lastChainId = None
        lastResId = None
        lastAsymId = None
        # Maps (residue id, asym id, atom name) to the Atom created for the first model.
        atomTable = {}
        # Model identifiers in order of first appearance; the index into this
        # list is also the index into self._positions.
        models = []
        for row in atomData.getRowList():
            # A column index of -1 is treated as "column absent", in which
            # case a fixed default value is used instead.
            asymId = ('A' if asymIdCol == -1 else row[asymIdCol])
            atomKey = ((row[resIdCol], asymId, row[atomNameCol]))
            model = ('1' if modelCol == -1 else row[modelCol])
            if model not in models:
                models.append(model)
                self._positions.append([])
            modelIndex = models.index(model)
            if modelIndex == 0:
                # This row defines a new atom.

                if lastChainId != row[chainIdCol]:
                    # The start of a new chain.
                    chain = top.addChain()
                    lastChainId = row[chainIdCol]
                    lastResId = None
                    lastAsymId = None
                if lastResId != row[resIdCol] or lastAsymId != asymId:
                    # The start of a new residue.
                    res = top.addResidue(row[resNameCol], chain)
                    lastResId = row[resIdCol]
                    lastAsymId = asymId
                # Atoms whose element symbol is not recognized keep element None.
                element = None
                try:
                    element = elem.get_by_symbol(row[elementCol])
                except KeyError:
                    pass
                atom = top.addAtom(row[atomNameCol], element, res)
                atomTable[atomKey] = atom
            else:
                # This row defines coordinates for an existing atom in one of the later models.

                try:
                    atom = atomTable[atomKey]
                except KeyError:
                    raise ValueError('Unknown atom %s in residue %s %s for model %s' % (row[atomNameCol], row[resNameCol], row[resIdCol], model))
                if atom.index != len(self._positions[modelIndex]):
                    raise ValueError('Atom %s for model %s does not match the order of atoms for model %s' % (row[atomIdCol], model, models[0]))
            # Coordinates are scaled by 0.1 here and tagged as nanometers below.
            self._positions[modelIndex].append(Vec3(float(row[xCol]), float(row[yCol]), float(row[zCol]))*0.1)
        for i in range(len(self._positions)):
            self._positions[i] = self._positions[i]*nanometers
        ## The atom positions read from the PDBx/mmCIF file.  If the file contains multiple frames, these are the positions in the first frame.
        self.positions = self._positions[0]
        self.topology.createStandardBonds()
        self.topology.createDisulfideBonds(self.positions)
        # Per-frame cache of numpy position arrays, filled lazily by getPositions().
        self._numpyPositions = None

        # Record unit cell information, if present.

        cell = block.getObj('cell')
        if cell is not None and cell.getRowCount() > 0:
            row = cell.getRow(0)
            cellSize = [float(row[cell.getAttributeIndex(attribute)]) for attribute in ('length_a', 'length_b', 'length_c')]*angstroms
            self.topology.setUnitCellDimensions(cellSize)

        # Add bonds based on struct_conn records.

        connectData = block.getObj('struct_conn')
        if connectData is not None:
            res1Col = connectData.getAttributeIndex('ptnr1_label_seq_id')
            res2Col = connectData.getAttributeIndex('ptnr2_label_seq_id')
            atom1Col = connectData.getAttributeIndex('ptnr1_label_atom_id')
            atom2Col = connectData.getAttributeIndex('ptnr2_label_atom_id')
            asym1Col = connectData.getAttributeIndex('ptnr1_label_asym_id')
            asym2Col = connectData.getAttributeIndex('ptnr2_label_asym_id')
            typeCol = connectData.getAttributeIndex('conn_type_id')
            connectBonds = []
            for row in connectData.getRowList():
                # Only the first six characters of the connection type are compared.
                connType = row[typeCol][:6]
                if connType in ('covale', 'disulf', 'modres'):
                    key1 = (row[res1Col], row[asym1Col], row[atom1Col])
                    key2 = (row[res2Col], row[asym2Col], row[atom2Col])
                    # Records naming atoms that are not in the topology are skipped.
                    if key1 in atomTable and key2 in atomTable:
                        connectBonds.append((atomTable[key1], atomTable[key2]))
            if len(connectBonds) > 0:
                # Only add bonds that don't already exist.
                existingBonds = set(top.bonds())
                for bond in connectBonds:
                    if bond not in existingBonds and (bond[1], bond[0]) not in existingBonds:
                        top.addBond(bond[0], bond[1])
                        existingBonds.add(bond)

    def getTopology(self):
        """Get the Topology of the model."""
        return self.topology

    def getNumFrames(self):
        """Get the number of frames stored in the file."""
        return len(self._positions)

    def getPositions(self, asNumpy=False, frame=0):
        """Get the atomic positions.

        Parameters:
         - asNumpy (boolean=False) if true, the values are returned as a numpy array instead of a list of Vec3s
         - frame (int=0) the index of the frame for which to get positions
        """
        if asNumpy:
            # Converted arrays are cached per frame so each conversion happens once.
            if self._numpyPositions is None:
                self._numpyPositions = [None]*len(self._positions)
            if self._numpyPositions[frame] is None:
                self._numpyPositions[frame] = Quantity(numpy.array(self._positions[frame].value_in_unit(nanometers)), nanometers)
            return self._numpyPositions[frame]
        return self._positions[frame]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment