''' A parser for AIML files ''' from __future__ import print_function from xml.sax.handler import ContentHandler from xml.sax.xmlreader import Locator import sys import xml.sax import xml.sax.handler from .constants import * class AimlParserError(Exception): pass class AimlHandler(ContentHandler): ''' A SAX handler for AIML files ''' # The legal states of the AIML parser _STATE_OutsideAiml = 0 _STATE_InsideAiml = 1 _STATE_InsideCategory = 2 _STATE_InsidePattern = 3 _STATE_AfterPattern = 4 _STATE_InsideThat = 5 _STATE_AfterThat = 6 _STATE_InsideTemplate = 7 _STATE_AfterTemplate = 8 def __init__(self, encoding=None): self.categories = {} self._encoding = encoding self._state = self._STATE_OutsideAiml self._version = "" self._namespace = "" self._forwardCompatibleMode = False self._currentPattern = "" self._currentThat = "" self._currentTopic = "" self._insideTopic = False self._currentUnknown = "" # the name of the current unknown element # This is set to true when a parse error occurs in a category. self._skipCurrentCategory = False # Counts the number of parse errors in a particular AIML document. # query with getNumErrors(). If 0, the document is AIML-compliant. self._numParseErrors = 0 # TODO: select the proper validInfo table based on the version number. self._validInfo = self._validationInfo101 # This stack of bools is used when parsing
  • elements inside # elements, to keep track of whether or not an # attribute-less "default"
  • element has been found yet. Only # one default
  • is allowed in each element. We need # a stack in order to correctly handle nested tags. self._foundDefaultLiStack = [] # This stack of strings indicates what the current whitespace-handling # behavior should be. Each string in the stack is either "default" or # "preserve". When a new AIML element is encountered, a new string is # pushed onto the stack, based on the value of the element's "xml:space" # attribute (if absent, the top of the stack is pushed again). When # ending an element, pop an object off the stack. self._whitespaceBehaviorStack = ["default"] self._elemStack = [] self._locator = Locator() self.setDocumentLocator(self._locator) def getNumErrors(self): "Return the number of errors found while parsing the current document." return self._numParseErrors def setEncoding(self, encoding): """ Set the text encoding to use when encoding strings read from XML. Defaults to no encoding. """ self._encoding = encoding def _location(self): "Return a string describing the current location in the source file." line = self._locator.getLineNumber() column = self._locator.getColumnNumber() return "(line %d, column %d)" % (line, column) def _pushWhitespaceBehavior(self, attr): """Push a new string onto the whitespaceBehaviorStack. The string's value is taken from the "xml:space" attribute, if it exists and has a legal value ("default" or "preserve"). Otherwise, the previous stack element is duplicated. """ assert len(self._whitespaceBehaviorStack) > 0, "Whitespace behavior stack should never be empty!" try: if attr["xml:space"] == "default" or attr["xml:space"] == "preserve": self._whitespaceBehaviorStack.append(attr["xml:space"]) else: raise AimlParserError( "Invalid value for xml:space attribute "+self._location() ) except KeyError: self._whitespaceBehaviorStack.append(self._whitespaceBehaviorStack[-1]) def startElementNS(self, name, qname, attr): print( "QNAME:", qname ) print( "NAME:", name ) uri,elem = name if (elem == "bot"): print( "name:", attr.getValueByQName("name"), "a'ite?" ) self.startElement(elem, attr) pass def startElement(self, name, attr): # Wrapper around _startElement, which catches errors in _startElement() # and keeps going. # If we're inside an unknown element, ignore everything until we're # out again. if self._currentUnknown != "": return # If we're skipping the current category, ignore everything until # it's finished. if self._skipCurrentCategory: return # process this start-element. try: self._startElement(name, attr) except AimlParserError as err: # Print the error message sys.stderr.write("PARSE ERROR: %s\n" % err) self._numParseErrors += 1 # increment error count # In case of a parse error, if we're inside a category, skip it. if self._state >= self._STATE_InsideCategory: self._skipCurrentCategory = True def _startElement(self, name, attr): if name == "aiml": # tags are only legal in the OutsideAiml state if self._state != self._STATE_OutsideAiml: raise AimlParserError( "Unexpected tag "+self._location() ) self._state = self._STATE_InsideAiml self._insideTopic = False self._currentTopic = u"" try: self._version = attr["version"] except KeyError: # This SHOULD be a syntax error, but so many AIML sets out there are missing # "version" attributes that it just seems nicer to let it slide. #raise AimlParserError( "Missing 'version' attribute in tag "+self._location() ) #print( "WARNING: Missing 'version' attribute in tag "+self._location() ) #print( " Defaulting to version 1.0" ) self._version = "1.0" self._forwardCompatibleMode = (self._version != "1.0.1") self._pushWhitespaceBehavior(attr) # Not sure about this namespace business yet... #try: # self._namespace = attr["xmlns"] # if self._version == "1.0.1" and self._namespace != "http://alicebot.org/2001/AIML-1.0.1": # raise AimlParserError( "Incorrect namespace for AIML v1.0.1 "+self._location() ) #except KeyError: # if self._version != "1.0": # raise AimlParserError( "Missing 'version' attribute(s) in tag "+self._location() ) elif self._state == self._STATE_OutsideAiml: # If we're outside of an AIML element, we ignore all tags. return elif name == "topic": # tags are only legal in the InsideAiml state, and only # if we're not already inside a topic. if (self._state != self._STATE_InsideAiml) or self._insideTopic: raise AimlParserError( "Unexpected tag", self._location() ) try: self._currentTopic = unicode(attr['name']) except KeyError: raise AimlParserError( "Required \"name\" attribute missing in element "+self._location() ) self._insideTopic = True elif name == "category": # tags are only legal in the InsideAiml state if self._state != self._STATE_InsideAiml: raise AimlParserError( "Unexpected tag "+self._location() ) self._state = self._STATE_InsideCategory self._currentPattern = u"" self._currentThat = u"" # If we're not inside a topic, the topic is implicitly set to * if not self._insideTopic: self._currentTopic = u"*" self._elemStack = [] self._pushWhitespaceBehavior(attr) elif name == "pattern": # tags are only legal in the InsideCategory state if self._state != self._STATE_InsideCategory: raise AimlParserError( "Unexpected tag "+self._location() ) self._state = self._STATE_InsidePattern elif name == "that" and self._state == self._STATE_AfterPattern: # are legal either inside a