worblehat-old/python/PyZ3950/ccl.py

#!/usr/bin/env python

"""Implements part of CCL, the Common Command Language, ISO 8777.  I'm
working from the description in the YAZ toolkit
(http://www.indexdata.dk/yaz/doc/tools.php), rather than the ISO
spec.  Two extensions:
- qualifiers can be literal "(attrtyp, attrval)" pairs, so, e.g., the
following is a legitimate query for ISBN: "(1,7)=0312033095"
- the optional ATTRSET (attrset/query) which must appear at the beginning
of the string.
Allowed values are:
BIB1 (default)
XD1
UTIL
ZTHES1
EXP1
or an oid expressed as a dotted string.  (A leading dot implies a
prefix of 1.2.840.10003.3, so, e.g., .1 is the same as BIB1.)

Eventually I will support v3-style mixing attribute sets within
a single query, but for now I don't.
"""

from __future__ import nested_scopes
import string

in_setup = 0

try:
    from PyZ3950 import z3950
    from PyZ3950 import oids
    from PyZ3950 import asn1

    _attrdict = {
        'bib1'  :  oids.Z3950_ATTRS_BIB1_ov,
        'zthes1': oids.Z3950_ATTRS_ZTHES_ov,
        'xd1': oids.Z3950_ATTRS_XD1_ov,
        'utility': oids.Z3950_ATTRS_UTIL_ov,
        'exp1':  oids.Z3950_ATTRS_EXP1_ov
        }

except ImportError, err:
    print "Error importing (OK during setup)", err
    in_setup = 1

class QuerySyntaxError(Exception):
    """Common base class for the errors this module raises for a query."""


class ParseError(QuerySyntaxError):
    """The query (or an attribute-set OID in it) could not be parsed."""


class LexError(QuerySyntaxError):
    """The lexer met input that matches no token rule."""


class UnimplError(QuerySyntaxError):
    """The query uses a construct this module cannot translate."""

# Names of every token type the lexer can produce; the grammar rules refer
# to these names as terminals.
tokens = (
    'LPAREN',
    'RPAREN',
    'COMMA',
    'SET',
    'ATTRSET',
    'QUAL',
    'QUOTEDVALUE',
    'RELOP',
    'WORD',
    'LOGOP',
    'SLASH',
    )

# Single-character punctuation tokens, given as plain regular expressions.
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_COMMA  = r','
t_SLASH  = r'/'
# Lexer rule for the ATTRSET keyword.  The string inside the function is the
# token's regular expression (case-insensitive), not ordinary documentation,
# so it must not be reworded.
def t_ATTRSET(t):
    r'(?i)ATTRSET'
    return t

# Lexer rule for the SET keyword (result-set reference, "SET=<name>").
# The negative lookahead keeps the rule from firing on the first three
# letters of a longer search term such as "SETTLERS", which would otherwise
# be split into a SET token plus a WORD and fail to parse.  The character
# class lists the characters a WORD may start with.
def t_SET (t): # need to def as function to override parsing as WORD, gr XXX
    r'(SET)(?![A-Za-z0-9&:])'
    return t

# Maps each CCL relational operator to the numeric value used for the
# relation attribute (attribute type 2) when the RPN query is built.
relop_to_attrib = {
    '<': 1,
    '<=': 2,
    '=': 3,
    '>=': 4,
    '>': 5,
    '<>': 6}

# Regex alternation takes the first alternative that matches, so the
# two-character operators must come before their one-character prefixes;
# otherwise "<=" could be lexed as "<" followed by "=".  Dict ordering gives
# no such guarantee, hence the explicit longest-first sort (ties are broken
# alphabetically, which keeps the pattern deterministic).
_relops_by_length = [(-len (op), op) for op in relop_to_attrib.keys ()]
_relops_by_length.sort ()
t_RELOP = "|".join (["(%s)" % op for (_neg_len, op) in _relops_by_length])
# XXX Index Data docs say 'doesn't follow ... ISO8777'?

# XXX expand to rd. addt'l defns from file?

qual_dict = { # These are bib-1 attribute values, see
# http://www.loc.gov/z3950/agency/defns/bib1.html and  ftp://ftp.loc.gov/pub/z3950/defs/bib1.txt
# Each value is an (attribute type, attribute value) pair.
    'TI': (1,4),
    'AU': (1,1003), # use 1003 to work w/ both NLC-BNC and LC
    'ISBN': (1,7),
    'LCCN': (1,9),
    'ANY': (1,1016),
    'FIF': (3, 1), # first-in-field
    'AIF': (3,3), # any-in-field (default)
    'RTRUNC': (5,1),
    'NOTRUNC': (5,100) # (default)
    }
# Qualifiers and relation applied to a bare search term with no "qual=".
default_quals = ['ANY'] # XXX should be per-attr-set
default_relop = '='

# Lexer rule for qualifiers.  It deliberately has no regex docstring in the
# source: mk_quals() assigns one built from the current qual_dict.
def t_QUAL(t):
    return t

def mk_quals ():
    """Rebuild the QUAL token regex from the current contents of qual_dict.

    The pattern is stored in t_QUAL.__doc__ and matches, case-insensitively,
    either a known qualifier name or a literal "(type,value)" pair of
    integers.

    - Names are regex-escaped, so a name is always matched literally.
    - Names are tried longest-first, so a qualifier that is a prefix of
      another cannot shadow it.
    - A name only matches when it is not immediately followed by a character
      that can start a WORD, so search terms that merely begin with a
      qualifier name (e.g. "tiger" for TI) are left to the WORD rule instead
      of being split in two.
    """
    import re
    by_length = [(-len (name), name) for name in qual_dict.keys ()]
    by_length.sort ()
    names = "|".join (['(' + re.escape (name) + ')'
                       for (_neg_len, name) in by_length])
    t_QUAL.__doc__ = ("(?i)(?:" + names + ")(?![A-Za-z0-9&:])"
                      + r"|(\([0-9]+,[0-9]+\))")

# Lexer rule for a double-quoted value.  The raw string below is the token's
# regular expression; the body removes the surrounding quote characters so
# the parser only sees the text between them.
def t_QUOTEDVALUE(t):
    r"(\".*?\")"
    text = t.value
    if text[0] != '"':
        return t
    t.value = text[1:-1]
    return t

# Characters that may start a WORD, and the extra ones allowed after the
# first character (comma, full stop, apostrophe).
word_init = "[a-z]|[A-Z]|[0-9]|&|:"
word_non_init = r",|\.|'"

# A WORD is one initial character followed by any mix of both sets.
t_WORD = "(" + word_init + ")(" + word_init + "|" + word_non_init + ")*"

# Lexer rule for the boolean operators AND / OR / NOT (case-insensitive).
# The negative lookahead stops the rule from matching the first letters of
# an ordinary search term such as "order", "android" or "nothing", which
# would otherwise be split into an operator plus a WORD and fail to parse.
# The character class lists the characters a WORD may start with.
def t_LOGOP(t):
    r'(?i)((AND)|(OR)|(NOT))(?![A-Za-z0-9&:])'
    return t


# Blanks and tabs between tokens are skipped.
t_ignore = " \t"

def t_error(t):
    # Lexer error hook: receives the offending token and aborts the query
    # with a LexError rather than attempting to skip the bad input.
    message = 't_error: ' + str (t)
    raise LexError (message)


from ply import lex


def relex ():
    """(Re)build the module-level lexer.

    Refreshes the QUAL token pattern from the current qual_dict via
    mk_quals(), then stores a newly built lexer in the global ``lexer``.
    """
    global lexer
    mk_quals ()
    lexer = lex.lex()

# Build the initial lexer when the module is imported.
relex ()

def add_qual (qual_name, val):
    """Add a qualifier definition, and regenerate the lexer.

    qual_name -- the qualifier as it will be written in queries; matching is
                 case-insensitive.
    val       -- the (attribute type, attribute value) pair it stands for.

    The name is stored upper-cased because xlate_qualifier looks qualifiers
    up by their upper-cased spelling; storing it as given would make a
    lower- or mixed-case name lex correctly and then fail with KeyError.
    """
    qual_dict[qual_name.upper ()] = val
    relex ()

from ply import yacc

#if in_setup:
#    import yacc
#else:
#    from PyZ3950 import yacc

class Node:
    """A node of the parse tree produced by the grammar rules.

    type     -- node kind (the grammar uses 'op', 'relop', 'set', 'attrset').
    children -- sub-nodes or other values; a false value becomes a new
                empty list.
    leaf     -- the node's own payload (operator, set name, ...).
    """
    def __init__(self,type,children=None,leaf=None):
        self.type = type
        self.children = children or [ ]
        self.leaf = leaf

    def str_child (self, child, depth):
        """Render one child at the given depth, newline-terminated."""
        if not isinstance (child, Node):
            # Plain values are printed on one indented line of their own.
            return "%s%s\n" % (" " * (4 * depth), str (child))
        return child.str_depth (depth)

    def str_depth (self, depth):
        """Render this node and its children, four spaces per level."""
        header = "%s%s %s" % (" " * (4 * depth), self.type, self.leaf)
        body = "".join ([self.str_child (kid, depth + 1)
                         for kid in self.children])
        return header + "\n" + body

    def __str__(self):
        return "\n" + self.str_depth (0)

# Grammar rules.  In each p_* function the string is the grammar production
# (not documentation); t[0] receives the rule's result and t[1..n] hold the
# values of the symbols on the right-hand side.

# Start symbol: a whole query.
def p_top (t):
    'top : cclfind_or_attrset'
    t[0] = t[1]

# A query with no explicit attribute set.
def p_cclfind_or_attrset_1 (t):
    'cclfind_or_attrset : cclfind'
    t[0] = t[1]

# ATTRSET(<name>/<query>): wrap the inner query in an 'attrset' node whose
# leaf is the attribute-set name (t[3]) and whose only child is the query.
def p_cclfind_or_attrset_2 (t):
    'cclfind_or_attrset : ATTRSET LPAREN WORD SLASH cclfind RPAREN'
    t[0] = Node ('attrset', [t[5]], t[3])

# Two sub-queries joined by a boolean operator: an 'op' node whose leaf is
# the operator text and whose children are the left and right operands.
def p_ccl_find_1(t):
    'cclfind : cclfind LOGOP elements'
    t[0] = Node ('op', [t[1],t[3]], t[2])

# A single element is a query by itself.
def p_ccl_find_2(t):
    'cclfind : elements'
    t[0] = t[1]

# Parenthesised sub-query: the parentheses only group, so pass the inner
# result through unchanged.
def p_elements_1(t):
    'elements : LPAREN cclfind RPAREN'
    t[0] = t[2]

class QuallistVal:
    """Pairs a list of qualifiers with the search value they apply to.

    Index 0 yields the qualifier list and index 1 the value; any other
    index raises IndexError, which lets the pair be iterated like a
    two-element sequence.
    """
    def __init__ (self, quallist, val):
        self.quallist = quallist
        self.val = val

    def __str__ (self):
        return "QV: " + str (self.quallist) + " " + str (self.val)

    def __getitem__ (self, i):
        if i == 0:
            return self.quallist
        elif i == 1:
            return self.val
        else:
            raise IndexError ('QuallistVal err ' + str (i))

def xlate_qualifier (x):
    """Translate a QUAL token's text into an (attr type, attr value) pair.

    x is either a literal pair such as "(1,7)", which is parsed into two
    integers, or a qualifier name, which is looked up upper-cased in
    qual_dict (KeyError if it is not defined there).
    """
    if x[0] == '(' and x[-1] == ')':
        t = x[1:-1].split (',') # t must be of len 2 b/c of lexer
        # int() replaces the long-deprecated string.atoi.
        return (int (t[0]), int (t[1]))
    return qual_dict[(x.upper ())]


# Reference to a previously created result set: SET = <name>.
def p_elements_2 (t):
    'elements : SET RELOP WORD'
    # Only "=" makes sense here; any other relational operator is reported
    # as a syntax error carrying the three offending tokens.  (The original
    # call had a misplaced parenthesis, so it passed three arguments to
    # str() and raised TypeError instead of QuerySyntaxError.)
    if t[2] != '=':
        raise QuerySyntaxError (str (t[1]), str (t[2]), str (t[3]))
    t[0] = Node ('set', leaf = t[3])

# A bare value with no qualifier: search it with the default qualifiers and
# the default relation.
def p_elements_3(t):
    'elements : val'
    implied = [xlate_qualifier (name) for name in default_quals]
    t[0] = Node ('relop', QuallistVal (implied, t[1]), default_relop)

# "qualifiers RELOP value": a 'relop' node whose leaf is the operator and
# whose children pair the translated qualifiers with the value.
def p_elements_4(t):
    'elements : quallist RELOP val'
    attrs = [xlate_qualifier (name) for name in t[1]]
    t[0] = Node ('relop', QuallistVal (attrs, t[3]), t[2])

# XXX p_elements_5 would be quals followed by recursive def'n, not yet implemented
# XXX p_elements_6 would be quals followed by range, not yet implemented.

# A qualifier list is built up as a Python list of QUAL token texts.

# Single qualifier: start a one-element list.
def p_quallist_1 (t):
    'quallist : QUAL'
    t[0] = [t[1]]

# Comma-separated continuation: a new list with the next qualifier appended
# (the left-hand list is not modified in place).
def p_quallist_2 (t):
    'quallist : quallist COMMA QUAL'
    t[0] = t[1] + [t[3]]

# A value is a quoted string or one or more words.

# Quoted value; the lexer has already removed the quotes.
def p_val_1(t):
    'val : QUOTEDVALUE'
    t[0] = t[1]

# Several consecutive words form one value, joined by single spaces.
def p_val_2(t):
    'val : val WORD'
    t[0] = t[1] + " " + t[2]

# A single unquoted word.
def p_val_3(t):
    'val : WORD'
    t[0] = t[1]


# XXX also don't yet handle proximity operator

def p_error(t):
    # Parser error hook: receives the token at which parsing failed and
    # aborts the query with a ParseError describing it.
    description = 'Parse p_error ' + str (t)
    raise ParseError (description)

# Operator precedence table for the parser: the boolean operators (LOGOP)
# are declared left-associative.
precedence = (
    ('left', 'LOGOP'),
    )

# Build the parser from the p_* rules above when the module is imported.
# NOTE(review): tabmodule presumably names the module PLY uses for its
# generated parse tables -- confirm against the PLY version in use.
yacc.yacc (debug=0, tabmodule = 'PyZ3950_parsetab')
#yacc.yacc (debug=0, tabpackage = 'PyZ3950', tabmodule='PyZ3950_parsetab')


def attrset_to_oid (attrset):
    """Return the OID value for an attribute-set name or dotted OID string.

    attrset is matched case-insensitively against the known attribute-set
    names in _attrdict.  Anything else is treated as a dotted OID; a leading
    dot means "relative to oids.Z3950_ATTRS".

    Raises ParseError if a component of the OID is not an integer.
    """
    l = attrset.lower ()
    if l in _attrdict:
        return _attrdict [l]
    split_l = l.split ('.')
    if split_l[0] == '':
        # Leading dot: prepend the Z39.50 attribute-set prefix.
        # NOTE(review): oids.Z3950_ATTRS is presumably a sequence of ints;
        # list() and int() below cope with ints and digit strings alike,
        # whereas string.atoi rejected anything that was not a string.
        split_l = list (oids.Z3950_ATTRS) + split_l[1:]
    try:
        intlist = [int (part) for part in split_l]
    except ValueError:
        raise ParseError ('Bad OID: ' + l)
    return asn1.OidVal (intlist)


def tree_to_q (ast):
    """Translate a parse-tree node into the RPN structure for the query.

    ast is a Node of type 'op', 'relop' or 'set':
    - 'op'    -> ('rpnRpnOp', RpnRpnOp) with both operands translated
                 recursively; CCL 'not' becomes Z39.50 'and-not'.
    - 'relop' -> ('op', ('attrTerm', AttributesPlusTerm)) built from the
                 node's qualifiers and value.
    - 'set'   -> ('op', ('resultSet', <set name>)).

    Raises UnimplError for any other node type or an unknown relation.
    The tree itself is left unmodified, so it can be translated repeatedly.
    """
    if ast.type == 'op':
        myrpnRpnOp = z3950.RpnRpnOp ()
        myrpnRpnOp.rpn1 = tree_to_q(ast.children[0])
        myrpnRpnOp.rpn2 = tree_to_q(ast.children[1])
        op = ast.leaf.lower ()
        if op == 'not': op = 'and-not' # CCL spec of 'not' vs. Z39.50 spec of 'and-not'
        myrpnRpnOp.op = (op, None)
        return ('rpnRpnOp', myrpnRpnOp)
    elif ast.type == 'relop':
        # XXX but e.g. LC (http://lcweb.loc.gov/z3950/lcserver.html)
        # doesn't support other relation attributes, either.
        try:
            relattr = relop_to_attrib [ast.leaf]
        except KeyError:  # should never happen, how could we have lexed it?
            raise UnimplError (ast.leaf)
        def make_aelt (qual):
            # qual is an (attribute type, attribute value) pair.
            val = ('numeric', qual [1])
            return z3950.AttributeElement (attributeType = qual[0],
                                           attributeValue = val)
        apt  = z3950.AttributesPlusTerm ()
        # Copy before appending: the list belongs to the parse tree, and
        # appending to it in place would add the relation attribute again
        # every time the same tree is translated.
        quallist = list (ast.children.quallist)
        if ast.leaf != '=':
            quallist.append ((2,relattr)) # 2 is relation attribute
            # see http://www.loc.gov/z3950/agency/markup/13.html ATR.1.1
        apt.attributes = [make_aelt (qual) for qual in quallist]
        apt.term = ('general', ast.children.val) # XXX update for V3?
        return ('op', ('attrTerm', apt))
    elif ast.type == 'set':
        return ('op', ('resultSet', ast.leaf))

    raise UnimplError("Bad ast type " + str(ast.type))

def mk_rpn_query (query):
    """Transform a CCL query into an RPN query.

    query is the CCL query string.  Returns the ('type_1', RPNQuery) pair
    produced by ast_to_rpn.  Lexing and parsing problems surface as the
    LexError / ParseError raised by the lexer and parser error hooks.
    """
    # need to copy or create a new lexer because it contains globals
    # PLY 1.0 lacks __copy__
    # PLY 1.3.1-1.5 have __copy__, but it's broken and returns None
    # I sent David Beazley a patch, so future PLY releases will
    # presumably work correctly.
    # Recreating the lexer each time is noticeably slower, so this solution
    # is suboptimal for PLY <= 1.5, but better than being thread-unsafe.
    # Perhaps I should have per-thread lexer instead XXX
    # with example/twisted/test.py set to parse_only, I get 277 parses/sec
    # with fixed PLY, vs. 63 parses/sec with broken PLY, on my 500 MHz PIII
    # laptop.

    copiedlexer = None
    if hasattr (lexer, '__copy__'):
        copiedlexer = lexer.__copy__ ()
    # Identity test: None is a singleton, and "== None" would defer to any
    # comparison method the lexer object might define.
    if copiedlexer is None:
        copiedlexer = lex.lex ()
    ast = yacc.parse (query, lexer = copiedlexer)
    return ast_to_rpn (ast)

def ast_to_rpn (ast):
    """Wrap a parsed query tree into a ('type_1', RPNQuery) pair.

    If the root node is an 'attrset' node, its leaf names the attribute set
    and its single child is the actual query; otherwise BIB1 is used.
    """
    if ast.type != 'attrset':
        attrset = oids.Z3950_ATTRS_BIB1_ov
    else:
        attrset = attrset_to_oid (ast.leaf)
        ast = ast.children [0]
    query = z3950.RPNQuery (attributeSet = attrset)
    query.rpn = tree_to_q (ast)
    return ('type_1', query)

def testlex (s):
    lexer.input (s)
    while 1:
        token = lexer.token ()
        if not token:
            break
        print token

def testyacc (s):
    copylex = lexer.__copy__ ()
    ast = yacc.parse (s, lexer = copylex)
    print "AST:", ast
    print "RPN Query:", ast_to_rpn (ast)

# Interactive smoke test: run one sample query, then read queries from the
# terminal until an empty line or end-of-file.
if __name__ == '__main__':
    testfn = testyacc
    #    testfn = testlex
    testfn ('attrset (BIB1/ au="Gaiman, Neil" or ti=Sandman)')
    while 1:
        try:
            s = raw_input ('Query: ')
        except EOFError:
            # End of input (e.g. Ctrl-D or a closed pipe): stop quietly
            # instead of ending with a traceback.
            break
        if not s:
            break
        testfn (s)
#    testyacc ()
#    testlex ()