'''
A parser for AIML files
'''

from __future__ import print_function

from xml.sax.handler import ContentHandler
from xml.sax.xmlreader import Locator
import sys
import xml.sax
import xml.sax.handler

from .constants import *


class AimlParserError(Exception):
    """Signals that an AIML document broke a rule enforced by the parser."""


class AimlHandler(ContentHandler):
    '''
    A SAX handler for AIML files
    '''

    # The legal states of the AIML parser.  The numeric order is significant:
    # the error handlers in startElement(), characters() and endElement()
    # test `self._state >= self._STATE_InsideCategory` to decide whether a
    # parse error happened inside a category, so every state that belongs to
    # a category must be numbered at or above _STATE_InsideCategory.
    _STATE_OutsideAiml    = 0
    _STATE_InsideAiml     = 1
    _STATE_InsideCategory = 2
    _STATE_InsidePattern  = 3
    _STATE_AfterPattern   = 4
    _STATE_InsideThat     = 5
    _STATE_AfterThat      = 6
    _STATE_InsideTemplate = 7
    _STATE_AfterTemplate  = 8


    def __init__(self, encoding=None):
        self.categories = {}
        self._encoding = encoding
        self._state = self._STATE_OutsideAiml
        self._version = ""
        self._namespace = ""
        self._forwardCompatibleMode = False
        self._currentPattern = ""
        self._currentThat    = ""
        self._currentTopic   = ""
        self._insideTopic = False
        self._currentUnknown = "" # the name of the current unknown element

        # This is set to true when a parse error occurs in a category.
        self._skipCurrentCategory = False

        # Counts the number of parse errors in a particular AIML document.
        # query with getNumErrors().  If 0, the document is AIML-compliant.
        self._numParseErrors = 0

        # TODO: select the proper validInfo table based on the version number.
        self._validInfo = self._validationInfo101

        # This stack of bools is used when parsing <li> elements inside
        # <condition> elements, to keep track of whether or not an
        # attribute-less "default" <li> element has been found yet.  Only
        # one default <li> is allowed in each <condition> element.  We need
        # a stack in order to correctly handle nested <condition> tags.
        self._foundDefaultLiStack = []

        # This stack of strings indicates what the current whitespace-handling
        # behavior should be.  Each string in the stack is either "default" or
        # "preserve".  When a new AIML element is encountered, a new string is
        # pushed onto the stack, based on the value of the element's "xml:space"
        # attribute (if absent, the top of the stack is pushed again).  When
        # ending an element, pop an object off the stack.
        self._whitespaceBehaviorStack = ["default"]
        
        self._elemStack = []
        self._locator = Locator()
        self.setDocumentLocator(self._locator)

    def getNumErrors(self):
        """Report how many parse errors have been counted so far."""
        errorCount = self._numParseErrors
        return errorCount

    def setEncoding(self, encoding):
        """
        Choose the text encoding applied to strings read from the XML input.

        The value is simply recorded in self._encoding; when it has never
        been set, no encoding is used.
        """
        self._encoding = encoding

    def _location(self):
        "Return a string describing the current location in the source file."
        line = self._locator.getLineNumber()
        column = self._locator.getColumnNumber()
        return "(line %d, column %d)" % (line, column)

    def _pushWhitespaceBehavior(self, attr):
        """Push a new string onto the whitespaceBehaviorStack.

        The string's value is taken from the "xml:space" attribute, if it exists
        and has a legal value ("default" or "preserve").  Otherwise, the previous
        stack element is duplicated.
        """
        assert len(self._whitespaceBehaviorStack) > 0, "Whitespace behavior stack should never be empty!"
        try:
            if attr["xml:space"] == "default" or attr["xml:space"] == "preserve":
                self._whitespaceBehaviorStack.append(attr["xml:space"])
            else:
                raise AimlParserError( "Invalid value for xml:space attribute "+self._location() )
        except KeyError:
            self._whitespaceBehaviorStack.append(self._whitespaceBehaviorStack[-1])

    def startElementNS(self, name, qname, attr):
        """
        Handle a start tag reported in namespace mode.

        `name` is a (uri, localname) pair; only the local name is used.
        `qname` is not used.  `attr` is expected to be a namespace-aware
        attribute collection whose keys are (uri, localname) pairs.

        The rest of this handler looks attributes up by their plain string
        name ("version", "name", "xml:space", ...), so the attributes are
        re-keyed by qualified name before being handed to startElement().
        """
        # Imported here so the method does not rely on a new file-level import.
        from xml.sax.xmlreader import AttributesImpl

        uri, elem = name

        getQName = getattr(attr, "getQNameByName", None)
        if getQName is None:
            # Not a namespace-aware collection; pass it through untouched.
            self.startElement(elem, attr)
            return

        qnameAttrs = {}
        for key, value in attr.items():
            try:
                attrName = getQName(key)
            except KeyError:
                attrName = None
            if attrName is None:
                # No qualified name recorded: fall back to the local name.
                attrName = key[1]
            qnameAttrs[attrName] = value
        self.startElement(elem, AttributesImpl(qnameAttrs))

    def startElement(self, name, attr):
        # Wrapper around _startElement, which catches errors in _startElement()
        # and keeps going.
        
        # If we're inside an unknown element, ignore everything until we're
        # out again.
        if self._currentUnknown != "":
            return
        # If we're skipping the current category, ignore everything until
        # it's finished.
        if self._skipCurrentCategory:
            return

        # process this start-element.
        try: self._startElement(name, attr)
        except AimlParserError as err:
            # Print the error message
            sys.stderr.write("PARSE ERROR: %s\n" % err)
            
            self._numParseErrors += 1 # increment error count
            # In case of a parse error, if we're inside a category, skip it.
            if self._state >= self._STATE_InsideCategory:
                self._skipCurrentCategory = True
            
    def _startElement(self, name, attr):
        """
        Process one start tag and advance the parser's state machine.

        `name` is the element name.  `attr` is a mapping-like attribute
        collection; it must support attr[key], `key in attr` and
        attr.items().

        Raises an AimlParserError if the element is not legal in the current
        state or fails validation.  An element that is not recognized is an
        error only when forward-compatible mode is off; otherwise its name is
        stored in self._currentUnknown so that the callers can ignore its
        contents.
        """
        if name == "aiml":
            # <aiml> tags are only legal in the OutsideAiml state
            if self._state != self._STATE_OutsideAiml:
                raise AimlParserError( "Unexpected <aiml> tag "+self._location() )
            self._state = self._STATE_InsideAiml
            self._insideTopic = False
            self._currentTopic = u""
            try: self._version = attr["version"]
            except KeyError:
                # This SHOULD be a syntax error, but so many AIML sets out
                # there are missing "version" attributes that it just seems
                # nicer to let it slide and assume version 1.0.
                self._version = "1.0"
            # Anything other than exactly "1.0.1" is parsed leniently.
            self._forwardCompatibleMode = (self._version != "1.0.1")
            self._pushWhitespaceBehavior(attr)
            # The "xmlns" attribute is not checked here.
        elif self._state == self._STATE_OutsideAiml:
            # If we're outside of an AIML element, we ignore all tags.
            return
        elif name == "topic":
            # <topic> tags are only legal in the InsideAiml state, and only
            # if we're not already inside a topic.
            if (self._state != self._STATE_InsideAiml) or self._insideTopic:
                raise AimlParserError( "Unexpected <topic> tag "+self._location() )
            try: self._currentTopic = unicode(attr['name'])
            except KeyError:
                raise AimlParserError( "Required \"name\" attribute missing in <topic> element "+self._location() )
            self._insideTopic = True
        elif name == "category":
            # <category> tags are only legal in the InsideAiml state
            if self._state != self._STATE_InsideAiml:
                raise AimlParserError( "Unexpected <category> tag "+self._location() )
            self._state = self._STATE_InsideCategory
            self._currentPattern = u""
            self._currentThat = u""
            # If we're not inside a topic, the topic is implicitly set to *
            if not self._insideTopic: self._currentTopic = u"*"
            self._elemStack = []
            self._pushWhitespaceBehavior(attr)
        elif name == "pattern":
            # <pattern> tags are only legal in the InsideCategory state
            if self._state != self._STATE_InsideCategory:
                raise AimlParserError( "Unexpected <pattern> tag "+self._location() )
            self._state = self._STATE_InsidePattern
        elif name == "that" and self._state == self._STATE_AfterPattern:
            # <that> are legal either inside a <template> element, or
            # inside a <category> element, between the <pattern> and the
            # <template> elements.  This clause handles the latter case.
            self._state = self._STATE_InsideThat
        elif name == "template":
            # <template> tags are only legal in the AfterPattern and AfterThat
            # states
            if self._state not in [self._STATE_AfterPattern, self._STATE_AfterThat]:
                raise AimlParserError( "Unexpected <template> tag "+self._location() )
            # if no <that> element was specified, it is implicitly set to *
            if self._state == self._STATE_AfterPattern:
                self._currentThat = u"*"
            self._state = self._STATE_InsideTemplate
            self._elemStack.append(['template',{}])
            self._pushWhitespaceBehavior(attr)
        elif self._state == self._STATE_InsidePattern:
            # Certain tags are allowed inside <pattern> elements.
            if name == "bot" and "name" in attr and attr["name"] == u"name":
                # Insert a special character string that the PatternMgr will
                # replace with the bot's name.
                self._currentPattern += u" BOT_NAME "
            else:
                raise AimlParserError( ( "Unexpected <%s> tag " % name)+self._location() )
        elif self._state == self._STATE_InsideThat:
            # Certain tags are allowed inside <that> elements.
            if name == "bot" and "name" in attr and attr["name"] == u"name":
                # Insert a special character string that the PatternMgr will
                # replace with the bot's name.
                self._currentThat += u" BOT_NAME "
            else:
                raise AimlParserError( ("Unexpected <%s> tag " % name)+self._location() )
        elif self._state == self._STATE_InsideTemplate and name in self._validInfo:
            # Starting a new element inside the current template. First
            # we need to convert 'attr' into a native Python dictionary,
            # so it can later be marshaled.
            it = ( (unicode(k),unicode(v)) for k,v in attr.items() )
            attrDict = dict( it )
            self._validateElemStart(name, attrDict, self._version)
            # Push the current element onto the element stack.
            self._elemStack.append( [unicode(name),attrDict] )
            self._pushWhitespaceBehavior(attr)
            # If this is a condition element, push a new entry onto the
            # foundDefaultLiStack
            if name == "condition":
                self._foundDefaultLiStack.append(False)
        else:
            # we're now inside an unknown element.
            if self._forwardCompatibleMode:
                # In Forward Compatibility Mode, we ignore the element and its
                # contents.
                self._currentUnknown = name
            else:
                # Otherwise, unknown elements are grounds for error!
                raise AimlParserError( ("Unexpected <%s> tag " % name)+self._location() )

    def characters(self, ch):
        """
        Error-tolerant front end for _characters().

        Text is dropped when the parser is outside <aiml>, inside an unknown
        element, or skipping a broken category.  Otherwise it is passed to
        _characters(); an AimlParserError raised there is reported on stderr
        and counted, and if it happened inside a category the rest of that
        category is skipped.
        """
        outsideDocument = self._state == self._STATE_OutsideAiml
        if outsideDocument or self._currentUnknown != "" or self._skipCurrentCategory:
            return

        try:
            self._characters(ch)
        except AimlParserError as problem:
            sys.stderr.write("PARSE ERROR: %s\n" % problem)
            self._numParseErrors += 1
            # A broken category is abandoned as a whole.
            if self._state >= self._STATE_InsideCategory:
                self._skipCurrentCategory = True
            
    def _characters(self, ch):
        """
        Store a chunk of character data according to the current state.

        Inside <pattern> or <that> the text is appended to the current
        pattern/that string.  Inside <template> it is attached to the element
        on top of the element stack as a "text" child, after checking that
        this element may contain text.  In any other state the text is
        ignored.

        Raises an AimlParserError if the enclosing template element may not
        contain text, or if the element stack is unexpectedly empty.
        """
        text = unicode(ch)
        state = self._state

        if state == self._STATE_InsidePattern:
            # TODO: text inside patterns must be upper-case!
            self._currentPattern += text
            return
        if state == self._STATE_InsideThat:
            self._currentThat += text
            return
        if state != self._STATE_InsideTemplate:
            # all other text is ignored
            return

        # Find the element that would receive the text.
        try:
            parent = self._elemStack[-1][0]
            parentAttr = self._elemStack[-1][1]
        except IndexError:
            # the element stack is empty. This should never happen.
            raise AimlParserError( "Element stack is empty while validating text "+self._location() )

        _required, _optional, mayHaveContent = self._validInfo[parent]
        if not mayHaveContent:
            raise AimlParserError( ("Unexpected text inside <%s> element "%parent)+self._location() )

        # <random> elements, and <condition> elements that lack either the
        # "name" or the "value" attribute, may only contain <li> children.
        # The whitespace that surrounds those <li> elements is dropped; any
        # other text there is a syntax error.
        liOnlyParent = (parent == "random") or (
            parent == "condition" and not ("name" in parentAttr and "value" in parentAttr))
        if liOnlyParent:
            if len(text.strip()) == 0:
                return
            raise AimlParserError( ("Unexpected text inside <%s> element "%parent)+self._location() )

        # Extend the trailing text child if there is one, otherwise start a
        # new text child tagged with the current whitespace mode.
        top = self._elemStack[-1]
        try:
            endsWithText = (top[-1][0] == "text")
        except (IndexError, KeyError):
            # The last item is the attribute dictionary (or is empty), so
            # the element has no text child yet.
            endsWithText = False

        if endsWithText:
            top[-1][2] += text
        else:
            top.append(["text", {"xml:space": self._whitespaceBehaviorStack[-1]}, text])

    def endElementNS(self, name, qname):
        """Handle a namespace-mode end tag by forwarding its local name to endElement()."""
        _namespaceUri, localName = name
        self.endElement(localName)
        
    def endElement(self, name):
        """
        Wrapper around _endElement() which catches errors raised there and
        keeps going.

        End tags are ignored outside <aiml>.  While an unknown element is
        being ignored, only its own end tag has an effect (it ends the
        ignoring).  While a broken category is being skipped, only a
        </category> tag has an effect (it ends the skipping).  Otherwise the
        tag goes to _endElement(); an AimlParserError raised there is
        reported on stderr and counted, and if it happened inside a category
        the rest of that category is skipped.
        """
        if self._state == self._STATE_OutsideAiml:
            # If we're outside of an AIML element, ignore all tags
            return
        if self._currentUnknown != "":
            # see if we're at the end of an unknown element.  If so, we can
            # stop ignoring everything.
            if name == self._currentUnknown:
                self._currentUnknown = ""
            return
        if self._skipCurrentCategory:
            # If we're skipping the current category, see if it's ending. We
            # stop on ANY </category> tag, since we're not keeping track of
            # state in ignore-mode.
            if name == "category":
                self._skipCurrentCategory = False
                self._state = self._STATE_InsideAiml
                # The elements that were open when the error occurred never
                # reach _endElement(), so whatever they pushed is still on
                # the stacks.  Drop it, or the next category would inherit
                # the broken category's whitespace mode.  Two entries stay:
                # the initial "default" and the one pushed for <aiml>.
                del self._whitespaceBehaviorStack[2:]
                del self._foundDefaultLiStack[:]
            return
        try: self._endElement(name)
        except AimlParserError as msg:
            # Print the message
            sys.stderr.write("PARSE ERROR: %s\n" % msg)
            self._numParseErrors += 1 # increment error count
            # In case of a parse error, if we're inside a category, skip it.
            if self._state >= self._STATE_InsideCategory:
                self._skipCurrentCategory = True

    def _endElement(self, name):
        """
        Process one end tag and advance the parser's state machine.

        Raises an AimlParserError when the end tag is not legal in the
        current state.
        """
        state = self._state

        if name == "aiml":
            # Only legal directly inside <aiml>.
            if state != self._STATE_InsideAiml:
                raise AimlParserError( "Unexpected </aiml> tag "+self._location() )
            self._state = self._STATE_OutsideAiml
            self._whitespaceBehaviorStack.pop()
            return

        if name == "topic":
            # Only legal directly inside <aiml>, and only if a topic is open.
            if not (state == self._STATE_InsideAiml and self._insideTopic):
                raise AimlParserError( "Unexpected </topic> tag "+self._location() )
            self._insideTopic = False
            self._currentTopic = u""
            return

        if name == "category":
            # Only legal once the template has been closed.
            if state != self._STATE_AfterTemplate:
                raise AimlParserError( "Unexpected </category> tag "+self._location() )
            self._state = self._STATE_InsideAiml
            # File the finished template under its (pattern, that, topic) key.
            categoryKey = (
                self._currentPattern.strip(),
                self._currentThat.strip(),
                self._currentTopic.strip(),
            )
            self.categories[categoryKey] = self._elemStack[-1]
            self._whitespaceBehaviorStack.pop()
            return

        if name == "pattern":
            if state != self._STATE_InsidePattern:
                raise AimlParserError( "Unexpected </pattern> tag "+self._location() )
            self._state = self._STATE_AfterPattern
            return

        if name == "that" and state == self._STATE_InsideThat:
            # A </that> met in any other state falls through to the generic
            # handling below (for example, as a template element).
            self._state = self._STATE_AfterThat
            return

        if name == "template":
            if state != self._STATE_InsideTemplate:
                raise AimlParserError( "Unexpected </template> tag "+self._location() )
            self._state = self._STATE_AfterTemplate
            self._whitespaceBehaviorStack.pop()
            return

        if state == self._STATE_InsidePattern or state == self._STATE_InsideThat:
            # <bot> is the only element allowed inside <pattern> and <that>.
            if name != "bot":
                raise AimlParserError( ("Unexpected </%s> tag " % name)+self._location() )
            return

        if state == self._STATE_InsideTemplate:
            # A template element is complete: move it from the stack into
            # its parent, which is the element now on top.
            finished = self._elemStack.pop()
            self._elemStack[-1].append(finished)
            self._whitespaceBehaviorStack.pop()
            # A finished <condition> also gives up its default-<li> flag.
            if finished[0] == "condition":
                self._foundDefaultLiStack.pop()
            return

        # Unexpected closing tag
        raise AimlParserError( ("Unexpected </%s> tag " % name)+self._location() )

    # A dictionary containing validation information for each AIML
    # element that may appear inside a template.  The keys are the element
    # names.  Each value is a tuple of three items: the first is a list
    # containing the names of REQUIRED attributes, the second is a list of
    # OPTIONAL attributes, and the third is a boolean value indicating
    # whether or not the element can contain other elements and/or text
    # (if False, the element can only appear in an atomic context, such as
    # <date/>).
    _validationInfo101 = {
        "bot":          ( ["name"], [], False ),
        "condition":    ( [], ["name", "value"], True ), # can only contain <li> elements
        "date":         ( [], [], False ),
        "formal":       ( [], [], True ),
        "gender":       ( [], [], True ),
        "get":          ( ["name"], [], False ),
        "gossip":       ( [], [], True ),
        "id":           ( [], [], False ),
        "input":        ( [], ["index"], False ),
        "javascript":   ( [], [], True ),
        "learn":        ( [], [], True ),
        "li":           ( [], ["name", "value"], True ),
        "lowercase":    ( [], [], True ),
        "person":       ( [], [], True ),
        "person2":      ( [], [], True ),
        "random":       ( [], [], True ), # can only contain <li> elements
        "sentence":     ( [], [], True ),
        "set":          ( ["name"], [], True),
        "size":         ( [], [], False ),
        "sr":           ( [], [], False ),
        "srai":         ( [], [], True ),
        "star":         ( [], ["index"], False ),
        "system":       ( [], [], True ),
        "template":     ( [], [], True ), # needs to be in the list because it can be a parent.
        "that":         ( [], ["index"], False ),
        "thatstar":     ( [], ["index"], False ),
        "think":        ( [], [], True ),
        "topicstar":    ( [], ["index"], False ),
        "uppercase":    ( [], [], True ),
        "version":      ( [], [], False ),
    }

    def _validateElemStart(self, name, attr, version):
        """
        Test the validity of an element starting inside a <template> element.

        `name` is the element name and must be a key of self._validInfo.
        `attr` is the element's attributes as a plain dictionary.  `version`
        is accepted for interface compatibility but is not used.

        Returns True if the element is valid.  Raises an AimlParserError if
        it is not: a required attribute is missing or an unexpected one is
        present (both only outside forward-compatible mode), an "index"
        attribute is not a positive integer, the parent element may not
        contain this element, or an <li> does not fit its parent.
        """
        # Check the element's attributes.  Make sure that all required
        # attributes are present, and that any remaining attributes are
        # valid options.
        required, optional, canBeParent = self._validInfo[name]
        for a in required:
            if a not in attr and not self._forwardCompatibleMode:
                raise AimlParserError( ("Required \"%s\" attribute missing in <%s> element " % (a,name))+self._location() )
        for a in attr:
            if a in required: continue
            if a[0:4] == "xml:": continue # attributes in the "xml" namespace can appear anywhere
            if a not in optional and not self._forwardCompatibleMode:
                raise AimlParserError( ("Unexpected \"%s\" attribute in <%s> element " % (a,name))+self._location() )

        # special-case: these tags take an optional "index" attribute whose
        # value must be a positive integer.
        if name in ["star", "thatstar", "topicstar"] and "index" in attr:
            indexText = attr["index"]
            try:
                index = int(indexText)
            except (TypeError, ValueError):
                # Only conversion failures mean "not an integer"; anything
                # else (e.g. KeyboardInterrupt) must not be swallowed.
                raise AimlParserError( ("Bad type for \"%s\" attribute (expected integer, found \"%s\") " % ("index",indexText))+self._location() )
            if index < 1:
                raise AimlParserError( ("\"%s\" attribute must have a positive value " % ("index"))+self._location() )

        # See whether the containing element is permitted to contain
        # subelements. If not, this element is invalid no matter what it is.
        try:
            parent = self._elemStack[-1][0]
            parentAttr = self._elemStack[-1][1]
        except IndexError:
            # If the stack is empty, no parent is present.  This should never
            # happen.
            raise AimlParserError( ("Element stack is empty while validating <%s> " % name)+self._location() )
        required, optional, canBeParent = self._validInfo[parent]
        nonBlockStyleCondition = (parent == "condition" and not ("name" in parentAttr and "value" in parentAttr))
        if not canBeParent:
            raise AimlParserError( ("<%s> elements cannot have any contents "%parent)+self._location() )
        # Special-case test if the parent element is <condition> (the
        # non-block-style variant) or <random>: these elements can only
        # contain <li> subelements.
        elif (parent == "random" or nonBlockStyleCondition) and name!="li":
            raise AimlParserError( ("<%s> elements can only contain <li> subelements "%parent)+self._location() )
        # Special-case test for <li> elements, which can only be contained
        # by non-block-style <condition> and <random> elements, and whose
        # required attributes are dependent upon which attributes are
        # present in the <condition> parent.
        elif name=="li":
            if not (parent=="random" or nonBlockStyleCondition):
                raise AimlParserError( ("Unexpected <li> element contained by <%s> element "%parent)+self._location() )
            if nonBlockStyleCondition:
                if "name" in parentAttr:
                    # Single-predicate condition.  Each <li> element except the
                    # last must have a "value" attribute.
                    if len(attr) == 0:
                        # This could be the default <li> element for this <condition>,
                        # unless we've already found one.
                        if self._foundDefaultLiStack[-1]:
                            raise AimlParserError( "Unexpected default <li> element inside <condition> "+self._location() )
                        else:
                            self._foundDefaultLiStack[-1] = True
                    elif len(attr) == 1 and "value" in attr:
                        pass # this is the valid case
                    else:
                        raise AimlParserError( "Invalid <li> inside single-predicate <condition> "+self._location() )
                elif len(parentAttr) == 0:
                    # Multi-predicate condition.  Each <li> element except the
                    # last must have a "name" and a "value" attribute.
                    if len(attr) == 0:
                        # This could be the default <li> element for this <condition>,
                        # unless we've already found one.
                        if self._foundDefaultLiStack[-1]:
                            raise AimlParserError( "Unexpected default <li> element inside <condition> "+self._location() )
                        else:
                            self._foundDefaultLiStack[-1] = True
                    elif len(attr) == 2 and "value" in attr and "name" in attr:
                        pass # this is the valid case
                    else:
                        raise AimlParserError( "Invalid <li> inside multi-predicate <condition> "+self._location() )
        # All is well!
        return True

def create_parser():
    """Build a SAX parser whose content handler is a fresh AimlHandler("UTF-8")."""
    saxParser = xml.sax.make_parser()
    saxParser.setContentHandler(AimlHandler("UTF-8"))
    # Namespace processing is left at the parser's default:
    #saxParser.setFeature(xml.sax.handler.feature_namespaces, True)
    return saxParser