worblehat-old/python/PyZ3950/c2query.py


#!/usr/local/bin/python2.3

try:
    from cStringIO import StringIO
except:
    from StringIO import StringIO
from PyZ3950 import z3950, oids
from types import IntType, StringType, ListType
# We need "\"\""  to be one token
from PyZ3950.CQLParser import CQLshlex
from PyZ3950.CQLUtils import ZCQLConfig
from PyZ3950.zdefs import make_attr
zconfig = ZCQLConfig()

"""
http://cheshire.berkeley.edu/cheshire2.html#zfind

top        ::= query ['resultsetid' name]
query      ::= query boolean clause | clause
clause     ::=  '(' query ')'
               | attributes [relation] term
               | resultset
attributes ::= '[' { [set] type '=' value } ']' | name
boolean    ::= 'and' | 'or' | 'not' | (synonyms)
prox       ::= ('!PROX' | (synonyms)) {'/' name}
relation   ::= '>' | '<' | ...

[bib1 1=5, bib1 3=6] > term and title @ fish
"""

booleans = {'AND' : 'and',
            '.AND.' : 'and',
            '&&' : 'and',
            'OR' : 'or',
            '.OR.' : 'or',
            '||' : 'or',
            'NOT' : 'and-not',
            '.NOT.' : 'and-not',
            'ANDNOT' : 'and-not',
            '.ANDNOT.' : 'and-not',
            '!!' : 'and-not'
            }

relations = {'<' : 1,
             'LT' : 1,
             '.LT.' : 1,
             '<=' : 2,
             'LE' : 2,
             '.LE.' : 2,
             '=' : 3,
             '>=' : 4,
             'GE' : 4,
             '.GE.' : 4,
             '>' : 5,
             'GT' : 5,
             '.GT.' : 5,
             '<>' : 6,
             '!=' : 6,
             'NE' : 6,
             '.NE.' : 6,
             '?' : 100,
             'PHON' : 100,
             '.PHON.' : 100,
             '%' : 101,
             'STEM' : 101,
             '.STEM.' : 101,
             '@' : 102,
             'REL' : 102,
             '.REL.' : 102,
             '<=>' : 104,
             'WITHIN' : 104,
             '.WITHIN.' : 104}

geoRelations = {'>=<' : 7,
                '.OVERLAPS.' : 7,
                '>#<' : 8,
                '.FULLY_ENCLOSED_WITHIN.' : 8,
                '<#>' : 9,
                '.ENCLOSES.' : 9,
                '<>#' : 10,
                '.OUTSIDE_OF.' : 10,
                '+-+' : 11,
                '.NEAR.' : 11,
                '.#.' : 12,
                '.MEMBERS_CONTAIN.' : 12,
                '!.#.' : 13,
                '.MEMBERS_NOT_CONTAIN.' : 13,
                ':<:' : 14,
                '.BEFORE.' : 14,
                ':<=:' : 15,
                '.BEFORE_OR_DURING.' : 15,
                ':=:' : 16,
                '.DURING.' : 16,
                ':>=:' : 17,
                '.DURING_OR_AFTER.' : 17,
                ':>:' : 18,
                '.AFTER.' : 18}

proxBooleans = {'!PROX' : (2, 0, 2),
                '!ADJ' : (2, 0, 2),
                '!NEAR' : (20, 0, 2),
                '!FAR' : (20, 0, 4),
                '!OPROX' : (2, 1, 2),
                '!OADJ' : (2, 1, 2),
                '!ONEAR' : (20, 1, 2),
                '!OFAR' : (20, 1, 4)}

proxUnits = {'C' : 1,
             'CHAR' : 1,
             'W' : 2,
             'WORD' : 2,
             'S' : 3,
             'SENT' : 3,
             'SENTENCE' : 3,
             'P' : 4,
             'PARA' : 4,
             'PARAGRAPH' : 4,
             'SECTION' : 5,
             'CHAPTER' : 6,
             'DOCUMENT' : 7,
             'ELEMENT' : 8,
             'SUBELEMENT' : 9,
             'ELEMENTTYPE' : 10,
             'BYTE' : 11}

privateBooleans = {'!FUZZY_AND' : 1,
                   '!FUZZY_OR' : 2,
                   '!FUZZY_NOT' : 3,
                   '!RESTRICT_FROM' : 4,
                   '!RESTRICT_TO' : 5,
                   '!MERGE_SUM' : 6,
                   '!MERGE_MEAN' : 7,
                   '!MERGE_NORM' : 8}

xzconfig = ZCQLConfig()

class C2Parser:
    lexer = None
    currentToken = None
    nextToken = None

    def __init__(self, l):
        self.lexer = l
        self.fetch_token()


    def fetch_token(self):
        tok = self.lexer.get_token()
        self.currentToken = self.nextToken
        self.nextToken = tok

    def is_boolean(self, tok=None):
        if (tok == None):
            tok = self.currentToken
        if (privateBooleans.has_key(tok.upper())):
            return 1
        elif (booleans.has_key(tok.upper())):
            return 2
        elif (proxBooleans.has_key(tok.upper())):
            return 3
        else:
            return 0


    def top(self):

        rpn = self.query()
        # Check for resultsetid
        if (self.currentToken.lower() == 'resultsetid'):
            self.fetch_token()
            resultset = self.currentToken
        else:
            resultset = None

        rpnq = z3950.RPNQuery()
        rpnq.attributeSet = oids.Z3950_ATTRS_BIB1_ov
        rpnq.rpn = rpn
        q = ('type_1', rpnq)
        return (q, resultset)

    def query(self):
        self.fetch_token()
        left = self.subquery()
        while 1:
            if not self.currentToken:
                break
            bool = self.is_boolean()
            if bool:
                bool = self.boolean()
                right = self.subquery()
                # Put left into triple, make triple new left
                op = z3950.RpnRpnOp()
                op.rpn1 = left
                op.rpn2 = right
                op.op = bool
                wrap = ('rpnRpnOp', op)
                left = wrap
            else:
                break
        return left


    def subquery(self):
        if self.currentToken == "(":
            object = self.query()
            if (self.currentToken <> ")"):
                raise ValueError
            else:
                self.fetch_token()
        else:
            object = self.clause()
        return object

    def boolean(self):
        tok = self.currentToken.upper()
        self.fetch_token()
        if (booleans.has_key(tok)):
            return (booleans[tok], None)
        elif (privateBooleans.has_key(tok)):
            # Generate cutesie prox trick
            type = privateBooleans[tok]
            prox = z3950.ProximityOperator()
            prox.proximityUnitCode = ('private', type)
            prox.distance = 0
            prox.ordered = 0
            prox.relationType = 3
            return ('op', ('prox', prox))

        elif (proxBooleans.has_key(tok)):
            # Generate prox
            prox = z3950.ProximityOperator()
            stuff = proxBooleans[tok]
            prox.distance = stuff[0]
            prox.ordered = stuff[1]
            prox.relationType = stuff[2]
            prox.proximityUnitCode = ('known', 2)

            # Now look for /
            while (self.currentToken == "/"):
                self.fetch_token()
                if (self.currentToken.isdigit()):
                    prox.distance = int(self.currentToken)
                elif (proxUnits.has_key(self.currentToken.upper())):
                    prox.proximityUnitCode = ('known', proxUnits[self.currentToken.upper()])
                else:
                    raise ValueError
                self.fetch_token()
            return ('op', ('prox', prox))
        else:
            # Argh!
            raise ValueError

    def clause(self):

        if (self.is_boolean(self.nextToken) or not self.nextToken or self.nextToken.lower() == 'resultsetid' or self.nextToken == ")"):
            # Must be a resultset
            tok = self.currentToken
            self.fetch_token()
            return ('op', ('resultSet', tok))

        elif (self.currentToken == '['):
            # List of attributes
            attrs = []
            oidHash = oids.oids['Z3950']['ATTRS']
            while (1):
                self.fetch_token()

                if (self.currentToken == ']'):
                    break

                if (oidHash.has_key(self.currentToken)):
                    attrSet = oidHash[self.currentToken]['ov']
                    self.fetch_token()
                elif (self.currentToken[:8] == '1.2.840.'):
                    attrSet = asn1.OidVal(map(int, self.currentToken.split('.')))
                    self.fetch_token()
                else:
                    attrSet = None

                if (self.currentToken[-1] == ','):
                    tok = self.currentToken[:-1]
                else:
                    tok = self.currentToken

                if (tok.isdigit()):
                    # 1 = foo
                    atype = int(tok)
                    self.fetch_token()
                    if (self.currentToken == '='):
                        # = foo
                        self.fetch_token()

                    if (self.currentToken[0] == '='):
                        # =foo
                        tok = self.currentToken[1:]
                    else:
                        tok = self.currentToken

                    if (tok[-1] == ','):
                        tok = tok[:-1]

                    if (tok.isdigit()):
                        val = int(tok)
                    else:
                        val = tok
                        if (val[0] == "'" and val[-1] == "'"):
                            val = val[1:-1]
                elif (tok[-1] == '='):
                    #1= foo
                    tok = tok[:-1]
                    if (tok.isdigit()):
                        atype = int(tok)
                    self.fetch_token()
                    if (self.currentToken[-1] == ","):
                        tok = self.currentToken[:-1]
                    else:
                        tok = self.currentToken
                    if (tok.isdigit()):
                        val = int(self.currentToken)
                    else:
                        val = tok
                        if (val[0] == "'" and val[-1] == "'"):
                            val = val[1:-1]

                elif (tok.find('=') > -1):
                    # 1=foo
                    (atype, val) = self.currentToken.split('=')
                    atype = int(atype)
                    if (val[-1] == ","):
                        val = val[:-1]
                    if (val.isdigit()):
                        val = int(val)
                    elif (val[0] == "'" and val[-1] == "'"):
                        val = val[1:-1]
                else:
                    # ???
                    raise ValueError
                attrs.append([attrSet, atype, val])

        else:
            # Check for named index
            if (zconfig.BIB1.has_key(self.currentToken.lower())):
                attrs = [[oids.Z3950_ATTRS_BIB1_ov, 1, zconfig.BIB1[self.currentToken.lower()]]]
            else:
                # Just pass through the name
                attrs = [[oids.Z3950_ATTRS_BIB1_ov, 1, self.currentToken]]

        self.fetch_token()
        # Check for relation
        tok = self.currentToken.upper()
        if (relations.has_key(tok)):
            val = relations[tok]
            found = 0
            for a in attrs:
                if (a[0] in [oids.Z3950_ATTRS_BIB1, None] and a[1] == 2):
                    found =1
                    a[2] = val
                    break
            if (not found):
                attrs.append([None, 2, val])
            self.fetch_token()
        elif (geoRelations.has_key(tok)):
            val = geoRelations[tok]
            found = 0
            for a in attrs:
                if (a[0] in [oids.Z3950_ATTRS_BIB1, oids.Z3950_ATTRS_GEO, None] and a[1] == 2):
                    found = 1
                    a[2] = val
                    break
            if (not found):
                attrs.append([oids.Z3950_ATTRS_GEO, 2, val])
            self.fetch_token()

        if (self.currentToken.find(' ')):
            # Already quoted
            term = self.currentToken
        else:
            # Accumulate
            term = []
            while (self.currentToken and not self.is_boolean(self.currentToken) and self.currentToken.lower() != 'resultsetid'):
                term.append(self.currenToken)
            term = ' '.join(term)

        self.fetch_token()

        # Phew. Now build AttributesPlusTerm
        clause = z3950.AttributesPlusTerm()
        clause.attributes = [make_attr(*e) for e in attrs]
        clause.term = ('general', term)
        return ('op', ('attrTerm', clause))


def parse(q):

    query = StringIO(q)
    lexer = CQLshlex(query)
    # Override CQL's wordchars list to include /=><
    lexer.wordchars += "!@#$%^&*-+;,.?|~`:\\><='"
    lexer.wordchars = lexer.wordchars.replace('[', '')
    lexer.wordchars = lexer.wordchars.replace(']', '')


    parser = C2Parser(lexer)
    return parser.top()