worblehat-old/python/PyZ3950/pqf.py

#!/usr/local/bin/python2.3

try:
    from cStringIO import StringIO
except:
    from StringIO import StringIO
from PyZ3950 import z3950, oids,asn1
from PyZ3950.zdefs import make_attr
from types import IntType, StringType, ListType
from PyZ3950.CQLParser import CQLshlex


"""
Parser for PQF directly into RPN structure.
PQF docs: http://www.indexdata.dk/yaz/doc/tools.html

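NB:  This does not implement /everything/ in PQF.  In particular, attributes given before an operator are not applied to the clauses that follow it, e.g. in:  @attr 2=3 @and @attr 1=4 title @attr 1=1003 author  (i.e. the 2=3 should apply to all subsequent clauses, but does not)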

"""


class PQFParser:
    """Recursive-descent parser turning a PQF token stream into an RPN query.

    Tokens are pulled from a lexer object exposing ``get_token()``.  The
    parser keeps one token of lookahead: ``currentToken`` is the token being
    processed and ``nextToken`` is the token that follows it.

    Malformed queries detected by this class are reported with ValueError.
    Unknown ``@term`` type names raise KeyError from the local lookup table;
    unknown attribute-set names presumably raise KeyError from the ``oids``
    table as well.
    """

    # Token source; must provide get_token().
    lexer = None
    # Token currently being processed (None until two tokens have been read).
    currentToken = None
    # One-token lookahead.
    nextToken = None

    def __init__(self, l):
        """Store the lexer ``l`` and prime the one-token lookahead."""
        self.lexer = l
        self.fetch_token()

    def fetch_token(self):
        """ Read ahead one token """
        tok = self.lexer.get_token()
        self.currentToken = self.nextToken
        self.nextToken = tok

    def is_boolean(self):
        """Return 1 if currentToken is @and/@or/@not/@prox (any case), else 0."""
        if self.currentToken.lower() in ['@and', '@or', '@not', '@prox']:
            return 1
        return 0

    def defaultClause(self, t):
        """Wrap term ``t`` in a clause carrying the default attributes.

        The defaults are bib-1 type 1 = 1016 and type 2 = 3
        ("anywhere =" in the words of the original author).
        """
        clause = z3950.AttributesPlusTerm()
        attrs = [(oids.Z3950_ATTRS_BIB1, 1, 1016), (oids.Z3950_ATTRS_BIB1, 2, 3)]
        clause.attributes = [make_attr(*e) for e in attrs]
        clause.term = t
        return ('op', ('attrTerm', clause))

    #  Grammar fns

    def query(self):
        """Parse a complete query and return ``('type_1', RPNQuery)``.

        Raises ValueError if tokens remain after the query structure has
        been consumed.
        """
        attrset = self.top_set()
        qst = self.query_struct()

        # Pull in a (hopefully) null token
        self.fetch_token()
        if self.currentToken:
            # Nope, unprocessed tokens remain
            raise ValueError("unprocessed tokens remain, starting at %r"
                             % (self.currentToken,))

        rpnq = z3950.RPNQuery()
        if attrset:
            rpnq.attributeSet = attrset
        else:
            # No @attrset prefix: fall back to bib-1.
            rpnq.attributeSet = oids.Z3950_ATTRS_BIB1_ov
        rpnq.rpn = qst

        return ('type_1', rpnq)

    def top_set(self):
        """Consume an optional leading ``@attrset <name>`` pair.

        Returns the attribute-set OID, or None when the query does not start
        with ``@attrset``.  A name beginning with ``1.2.840.10003.`` is
        treated as a dotted OID; anything else is looked up (upper-cased) in
        the ``oids`` table.
        """
        if self.nextToken != '@attrset':
            return None
        # First fetch makes '@attrset' current, second makes its argument current.
        self.fetch_token()
        self.fetch_token()
        n = self.currentToken.upper()
        if n[:14] == "1.2.840.10003.":
            # A real list (not map()) so the result is the same on Python 2 and 3.
            return asn1.OidVal([int(arc) for arc in n.split('.')])
        return oids.oids['Z3950']['ATTRS'][n]['oid']

    # This totally ignores the BNF, but does the 'right' thing
    def query_struct(self):
        """Parse one query structure and return its RPN representation.

        Handles, in order: an ``@attr`` list followed by a term, a boolean
        operator with two operands, ``@set``, a ``{ ... }`` group, and a bare
        term (which receives the default attributes).

        Raises ValueError for a ``{`` group that is not closed by ``}``.
        """
        self.fetch_token()
        if self.currentToken == '@attr':
            attrs = []
            while self.currentToken == '@attr':
                attrs.append(self.attr_spec())
                self.fetch_token()
            t = self.term()

            # Now we have attrs + term
            clause = z3950.AttributesPlusTerm()
            clause.attributes = [make_attr(*e) for e in attrs]
            clause.term = t
            return ('op', ('attrTerm', clause))
        elif self.is_boolean():
            # @operator query query
            return self.complex()
        elif self.currentToken == '@set':
            return self.result_set()
        elif self.currentToken == "{":
            # Parens
            s = self.query_struct()
            # '!=' rather than the Python-2-only '<>' operator.
            if self.nextToken != "}":
                raise ValueError("expected '}' but found %r" % (self.nextToken,))
            self.fetch_token()
            return s
        else:
            t = self.term()
            return self.defaultClause(t)

    def term(self):
        """Parse a term, optionally preceded by ``@term <type>``.

        Returns ``(type, text)`` where type is 'general' unless ``@term``
        selected another one; surrounding double quotes are stripped from
        the text.

        Raises ValueError when the token stream is exhausted where a term
        is expected, and KeyError for an unknown ``@term`` type name.
        """
        # Need to split to allow attrlist then @term
        term_type = 'general'
        if self.currentToken == '@term':
            self.fetch_token()
            types = {'general': 'general', 'string': 'characterString',
                     'numeric': 'numeric', 'external': 'external'}
            term_type = types[self.currentToken.lower()]
            self.fetch_token()

        # An exhausted lexer yields an empty token; report that the same way
        # as every other malformed query instead of failing on the indexing
        # below.
        if not self.currentToken:
            raise ValueError("missing term")

        if self.currentToken[0] == '"' and self.currentToken[-1] == '"':
            text = self.currentToken[1:-1]
        else:
            text = self.currentToken

        return (term_type, text)

    def result_set(self):
        """Parse the name following ``@set`` into a resultSet operand."""
        self.fetch_token()
        return ('op', ('resultSet', self.currentToken))

    def attr_spec(self):
        """Parse ``[attrset] type=value`` following an ``@attr`` token.

        Returns ``(attrset, type, value)``: attrset is an OID or None, type
        is an int, and value is an int when it is all digits, otherwise the
        original string.

        Raises ValueError when the spec is not of the form ``type=value``
        with a numeric type.
        """
        # @attr is CT
        self.fetch_token()
        if self.currentToken.find('=') == -1:
            # No '=' in this token, so it names the attribute set.
            attrset = self.currentToken
            if attrset[:14] == "1.2.840.10003.":
                attrset = asn1.OidVal([int(arc) for arc in attrset.split('.')])
            else:
                attrset = oids.oids['Z3950']['ATTRS'][attrset.upper()]['oid']
            self.fetch_token()
        else:
            attrset = None
        # May raise ValueError if the token does not contain exactly one '='.
        (atype, val) = self.currentToken.split('=')
        if not atype.isdigit():
            raise ValueError("attribute type must be numeric, got %r" % (atype,))
        atype = int(atype)
        if val.isdigit():
            val = int(val)
        return (attrset, atype, val)

    def complex(self):
        """Parse ``@operator query query`` into an rpnRpnOp node."""
        op = z3950.RpnRpnOp()
        op.op = self.boolean()
        op.rpn1 = self.query_struct()
        op.rpn2 = self.query_struct()
        return ('rpnRpnOp', op)

    def boolean(self):
        """Translate the current operator token into ``(name, operand)``.

        ``@and``/``@or`` give ``(name, None)`` and ``@not`` gives
        ``('and-not', None)``.  ``@prox`` consumes six further tokens
        (exclusion, distance, ordered, relation, which-code, unit-code) and
        gives ``('prox', ProximityOperator)``.

        Raises ValueError when a numeric @prox parameter is not all digits
        or the which-code does not start with 'k' or 'p'.
        """
        b = self.currentToken[1:].lower()
        if b == 'prox':
            self.fetch_token()
            exclusion = self.currentToken
            self.fetch_token()
            distance = self.currentToken
            self.fetch_token()
            ordered = self.currentToken
            self.fetch_token()
            relation = self.currentToken
            self.fetch_token()
            which = self.currentToken
            self.fetch_token()
            unit = self.currentToken

            prox = z3950.ProximityOperator()
            if not (relation.isdigit() and exclusion.isdigit()
                    and distance.isdigit() and ordered.isdigit()
                    and unit.isdigit()):
                raise ValueError("@prox parameters must be numeric")
            prox.relationType = int(relation)
            # bool() of a non-empty string is always True, so "0" must be
            # converted to an int before being turned into a flag.
            prox.exclusion = bool(int(exclusion))
            prox.distance = int(distance)
            # The ordered flag used to be read and then dropped.
            # NOTE(review): field name taken from the Z39.50 ASN.1
            # definition - confirm against z3950.ProximityOperator.
            prox.ordered = bool(int(ordered))
            # NOTE(review): the Z39.50 ASN.1 calls this field
            # proximityUnitCode; 'unit' is kept as in the original code -
            # confirm against z3950.ProximityOperator.
            if which[:1] == 'k':
                prox.unit = ('known', int(unit))
            elif which[:1] == 'p':
                prox.unit = ('private', int(unit))
            else:
                raise ValueError("@prox which-code must be 'known' or 'private'")

            return (b, prox)
        elif b == 'not':
            return ('and-not', None)
        else:
            return (b, None)


def parse(q):
    """Parse the PQF query string ``q`` and return PQFParser.query()'s result."""
    tokenizer = CQLshlex(StringIO(q))
    # Widen CQL's word characters so PQF punctuation (e.g. /=><()) stays
    # inside a single token.
    tokenizer.wordchars += "!@#$%^&*-+[];,.?|~`:\\><=/'()"
    return PQFParser(tokenizer).query()


def rpn2pqf(rpn):
    """Render an RPN structure as the equivalent PQF string.

    ``rpn`` is a ``(tag, body)`` pair.  Handled tags are 'type_1' (top-level
    query), 'rpnRpnOp' (operator with two operands) and 'op' (an 'attrTerm'
    or 'resultSet' operand).  Anything else falls through and returns None.

    For operators other than and/or/and-not only the bare '@prox' keyword is
    written; its parameters are not emitted.
    """
    tag, body = rpn[0], rpn[1]

    if tag == 'type_1':
        # Top level: optional attribute-set prefix, then the query proper.
        prefix = ""
        if body.attributeSet:
            dotted = '.'.join([str(arc) for arc in body.attributeSet.lst])
            prefix = '@attrset %s ' % (dotted)
        remainder = rpn2pqf(body.rpn)
        return "%s%s" % (prefix, remainder)

    if tag == 'rpnRpnOp':
        # Operator followed by its two operands.
        name = body.op[0]
        if name in ['and', 'or']:
            pieces = ['@', name, ' ']
        elif name == 'and-not':
            pieces = ['@not ']
        else:
            # XXX proximity parameters are not written out
            pieces = ['@prox']
        pieces.extend([' ', rpn2pqf(body.rpn1), ' ', rpn2pqf(body.rpn2)])
        return ''.join(pieces)

    if tag == 'op':
        kind = body[0]
        if kind == 'attrTerm':
            clause = body[1]
            pieces = []
            for attr in clause.attributes:
                value = attr.attributeValue
                if value[0] == 'numeric':
                    rendered = str(value[1])
                else:
                    # Non-numeric value: use the first entry of its list.
                    rendered = value[1].list[0][1]
                pieces.append("@attr %i=%s " % (attr.attributeType, rendered))
            pieces.append('"%s" ' % (clause.term[1]))
            return ''.join(pieces)
        if kind == 'resultSet':
            return "@set %s" % (body[1])