#!/usr/bin/env python

"""Implements part of CCL, the Common Command Language, ISO 8777.  I'm
working from the description in the YAZ toolkit
(http://www.indexdata.dk/yaz/doc/tools.php), rather than the ISO
spec.  Two extensions:
- qualifiers can be literal "(attrtyp, attrval)" pairs, so, e.g., the
  following is a legitimate query for ISBN: "(1,7)=0312033095"
- the optional ATTRSET (attrset/query), which must appear at the
  beginning of the string.
  Allowed values are:
    BIB1 (default)
    XD1
    UTIL (or UTILITY)
    ZTHES1
    EXP1
  or an oid expressed as a dotted string.  (A leading dot implies a
  prefix of 1.2.840.10003.3, so, e.g., .1 is the same as BIB1.)

Eventually I will support v3-style mixing attribute sets within
a single query, but for now I don't.
"""

from __future__ import nested_scopes
import string
import sys

# Set to 1 when the PyZ3950 imports below fail (expected while running setup).
in_setup = 0

try:
    from PyZ3950 import z3950
    from PyZ3950 import oids
    from PyZ3950 import asn1
    # Lower-cased attribute-set name -> OID value, as used by ATTRSET (name/...).
    _attrdict = {
        'bib1' : oids.Z3950_ATTRS_BIB1_ov,
        'zthes1': oids.Z3950_ATTRS_ZTHES_ov,
        'xd1': oids.Z3950_ATTRS_XD1_ov,
        'utility': oids.Z3950_ATTRS_UTIL_ov,
        # The module docstring advertises UTIL, so accept the short form too.
        'util': oids.Z3950_ATTRS_UTIL_ov,
        'exp1': oids.Z3950_ATTRS_EXP1_ov
        }
except ImportError:
    # sys.exc_info keeps this readable by every Python version the rest of
    # the module supports (no "except X, e" / "except X as e" syntax).
    err = sys.exc_info ()[1]
    print ("Error importing (OK during setup) " + str (err))
    in_setup = 1


class QuerySyntaxError(Exception):
    """Base class for every error raised while handling a CCL query."""
    pass

class ParseError(QuerySyntaxError):
    """The token stream does not match the grammar (or an OID is malformed)."""
    pass

class LexError(QuerySyntaxError):
    """The query contains text the lexer cannot tokenize."""
    pass

class UnimplError(QuerySyntaxError):
    """The query uses a construct this module does not implement."""
    pass


# Token names handed to PLY.
tokens = ('LPAREN', 'RPAREN', 'COMMA',
          'SET', 'ATTRSET','QUAL', 'QUOTEDVALUE', 'RELOP', 'WORD',
          'LOGOP', 'SLASH')

t_LPAREN= r'\('
t_RPAREN= r'\)'
t_COMMA = r','
t_SLASH = r'/'

# NOTE: for PLY function rules the docstring IS the token regex, so the
# docstrings of the t_* functions must not be edited as documentation.
# This rule is defined before t_SET, so "ATTRSET" is tried first.
def t_ATTRSET(t):
    r'(?i)ATTRSET'
    return t

def t_SET (t): # need to def as function to override parsing as WORD, gr XXX
    r'(SET)'
    return t


# CCL relational operator -> value of the relation attribute (type 2).
relop_to_attrib = {
    '<': 1,
    '<=': 2,
    '=': 3,
    '>=': 4,
    '>': 5,
    '<>': 6}

# Regex alternation takes the first alternative that matches, so the
# two-character operators must precede their one-character prefixes;
# otherwise "<=" could be lexed as "<" followed by "=".  Sorting on
# (-length, text) also makes the pattern independent of dict ordering.
_relops = [(-len (r), r) for r in relop_to_attrib.keys ()]
_relops.sort ()
t_RELOP = "|".join (["(%s)" % r for (_neglen, r) in _relops])
# XXX Index Data docs say 'doesn't follow ... ISO8777'?

# XXX expand to rd. addt'l defns from file?
qual_dict = { # These are bib-1 attribute values, see
# http://www.loc.gov/z3950/agency/defns/bib1.html and ftp://ftp.loc.gov/pub/z3950/defs/bib1.txt
    'TI': (1,4),
    'AU': (1,1003), # use 1003 to work w/ both NLC-BNC and LC
    'ISBN': (1,7),
    'LCCN': (1,9),
    'ANY': (1,1016),
    'FIF': (3, 1), # first-in-field
    'AIF': (3,3), # any-in-field (default)
    'RTRUNC': (5,1),
    'NOTRUNC': (5,100) # (default)
    }
default_quals = ['ANY'] # XXX should be per-attr-set
default_relop = '='

# The regex for QUAL is not written as a docstring here: mk_quals () installs
# it in t_QUAL.__doc__ so that it can be rebuilt whenever qual_dict changes.
def t_QUAL(t):
    return t

def mk_quals ():
    """(Re)build the QUAL token regex from the current keys of qual_dict.

    The pattern matches, case-insensitively, any qualifier name or a
    literal "(type,value)" pair of integers.
    """
    import re
    # Longest names first: alternation takes the first match, so a name that
    # is a prefix of another (e.g. AU vs. a later-added AUT) must come second.
    ordered = [(-len (name), name) for name in qual_dict.keys ()]
    ordered.sort ()
    # Escape the names so one containing a regex metacharacter matches itself.
    alternatives = ['(' + re.escape (name) + ')' for (_neglen, name) in ordered]
    alternatives.append (r"(\([0-9]+,[0-9]+\))")
    t_QUAL.__doc__ = "(?i)" + "|".join (alternatives)

# The docstring below is the token regex (PLY convention); the surrounding
# double quotes are stripped from the token value.
def t_QUOTEDVALUE(t):
    r"(\".*?\")"
    if t.value[0] == '"':
        t.value = t.value[1:-1]
    return t

word_init = "[a-z]|[A-Z]|[0-9]|&|:"
# Raw string with the same value as before (comma, escaped dot, apostrophe).
word_non_init = r",|\.|'"

t_WORD = "(%s)(%s|%s)*" % (word_init, word_init, word_non_init)

def t_LOGOP(t):
    r'(?i)(AND)|(OR)|(NOT)'
    return t


t_ignore = " \t"

def t_error(t):
    """PLY hook for untokenizable input: always raises LexError."""
    raise LexError ('t_error: ' + str (t))


from ply import lex


def relex ():
    """Rebuild the QUAL regex and then the module-level lexer."""
    global lexer
    mk_quals ()
    lexer = lex.lex()

relex ()

def add_qual (qual_name, val):
    """Add a qualifier definition, and regenerate the lexer.

    qual_name is stored upper-cased because xlate_qualifier () looks names
    up in upper case; val is the (attribute type, attribute value) pair.
    """
    qual_dict[qual_name.upper ()] = val
    relex ()

from ply import yacc

#if in_setup:
#    import yacc
#else:
#    from PyZ3950 import yacc

class Node:
    """A node of the parsed query tree.

    type is the node kind ('op', 'relop', 'set' or 'attrset' in this
    module), children holds the sub-items (a list, or a QuallistVal for
    'relop' nodes) and leaf the operator / name attached to the node.
    """
    def __init__(self,type,children=None,leaf=None):
        self.type = type
        if children:
            self.children = children
        else:
            self.children = [ ]
        self.leaf = leaf
    def str_child (self, child, depth):
        """Render one child, indented by depth levels."""
        if isinstance (child, Node): # ugh
            return child.str_depth (depth)
        indent = " " * (4 * depth)
        return indent + str (child) + "\n"
    def str_depth (self, depth): # ugh
        """Render this node and, one level deeper, all of its children."""
        indent = " " * (4 * depth)
        header = "%s%s %s" % (indent, self.type, self.leaf)
        rendered = [self.str_child (child, depth + 1)
                    for child in self.children]
        return "\n".join ([header, "".join (rendered)])
    def __str__(self):
        return "\n" + self.str_depth (0)

# Grammar rules.  For PLY the docstring of each p_* function is the
# production itself, so those docstrings must stay exactly as written.

def p_top (t):
    'top : cclfind_or_attrset'
    t[0] = t[1]

def p_cclfind_or_attrset_1 (t):
    'cclfind_or_attrset : cclfind'
    t[0] = t[1]

def p_cclfind_or_attrset_2 (t):
    'cclfind_or_attrset : ATTRSET LPAREN WORD SLASH cclfind RPAREN'
    # leaf is the attribute-set name, the single child is the query proper.
    t[0] = Node ('attrset', [t[5]], t[3])

def p_ccl_find_1(t):
    'cclfind : cclfind LOGOP elements'
    t[0] = Node ('op', [t[1],t[3]], t[2])

def p_ccl_find_2(t):
    'cclfind : elements'
    t[0] = t[1]

def p_elements_1(t):
    'elements : LPAREN cclfind RPAREN'
    t[0] = t[2]

class QuallistVal:
    """The payload of a 'relop' node: a list of qualifiers plus a term.

    Supports indexing with 0 (quallist) and 1 (val), which is also what
    lets Node.str_depth () iterate over it.
    """
    def __init__ (self, quallist, val):
        self.quallist = quallist
        self.val = val
    def __str__ (self):
        return "QV: %s %s" % (str(self.quallist),str (self.val))
    def __getitem__ (self, i):
        if i == 0: return self.quallist
        if i == 1: return self.val
        raise IndexError ('QuallistVal err ' + str (i))

def xlate_qualifier (x):
    """Turn a QUAL token into an (attribute type, attribute value) pair.

    x is either a literal "(type,value)" pair or a qualifier name, which
    is looked up upper-cased in qual_dict (KeyError if it is unknown).
    """
    if x[0] == '(' and x[-1] == ')':
        t = x[1:-1].split (',') # t must be of len 2 b/c of lexer
        return (int (t[0]), int (t[1]))
    return qual_dict[(x.upper ())]


def p_elements_2 (t):
    'elements : SET RELOP WORD'
    if t[2] != '=':
        # Result sets can only be referenced with '='.
        raise QuerySyntaxError ("%s %s %s" % (t[1], t[2], t[3]))
    t[0] = Node ('set', leaf = t[3])

def p_elements_3(t):
    'elements : val'
    # A bare term gets the default qualifiers and the default relation.
    quals = [xlate_qualifier (q) for q in default_quals]
    t[0] = Node ('relop', QuallistVal (quals, t[1]), default_relop)

def p_elements_4(t):
    'elements : quallist RELOP val'
    quals = [xlate_qualifier (q) for q in t[1]]
    t[0] = Node ('relop', QuallistVal (quals, t[3]), t[2])

# XXX p_elements_5 would be quals followed by recursive def'n, not yet implemented
# XXX p_elements_6 would be quals followed by range, not yet implemented.
# Grammar rules, continued.  The docstring of each p_* function is the PLY
# production and must stay exactly as written.

def p_quallist_1 (t):
    'quallist : QUAL'
    t[0] = [t[1]]

def p_quallist_2 (t):
    'quallist : quallist COMMA QUAL'
    t[0] = t[1] + [t[3]]

def p_val_1(t):
    'val : QUOTEDVALUE'
    t[0] = t[1]

def p_val_2(t):
    'val : val WORD'
    # Consecutive words form a single term, joined by one space.
    t[0] = t[1] + " " + t[2]

def p_val_3(t):
    'val : WORD'
    t[0] = t[1]

# XXX also don't yet handle proximity operator

def p_error(t):
    """PLY hook for syntax errors: always raises ParseError."""
    raise ParseError ('Parse p_error ' + str (t))

precedence = (
    ('left', 'LOGOP'),
    )

yacc.yacc (debug=0, tabmodule = 'PyZ3950_parsetab')
#yacc.yacc (debug=0, tabpackage = 'PyZ3950', tabmodule='PyZ3950_parsetab')


def attrset_to_oid (attrset):
    """Map an attribute-set name or dotted OID string to an OID value.

    Names are looked up case-insensitively in _attrdict.  Anything else is
    read as a dotted OID; a leading dot stands for the oids.Z3950_ATTRS
    prefix.  Raises ParseError when a component is not an integer.
    """
    name = attrset.lower ()
    if name in _attrdict:
        return _attrdict [name]
    parts = name.split ('.')
    if parts[0] == '':
        parts = oids.Z3950_ATTRS + parts[1:]
    try:
        # int () accepts both the string components typed by the user and
        # prefix elements that may already be integers.
        intlist = [int (part) for part in parts]
    except ValueError:
        raise ParseError ('Bad OID: ' + name)
    return asn1.OidVal (intlist)


def tree_to_q (ast):
    """Convert a query tree (without any 'attrset' wrapper) to RPN structure.

    Handles 'op', 'relop' and 'set' nodes; raises UnimplError for any other
    node type or for a relation that has no attribute mapping.
    """
    if ast.type == 'op':
        myrpnRpnOp = z3950.RpnRpnOp ()
        myrpnRpnOp.rpn1 = tree_to_q(ast.children[0])
        myrpnRpnOp.rpn2 = tree_to_q(ast.children[1])
        op = ast.leaf.lower ()
        if op == 'not':
            op = 'and-not' # CCL spec of 'not' vs. Z39.50 spec of 'and-not'
        myrpnRpnOp.op = (op, None)
        return ('rpnRpnOp', myrpnRpnOp)
    elif ast.type == 'relop':
        # XXX but e.g. LC (http://lcweb.loc.gov/z3950/lcserver.html)
        # doesn't support other relation attributes, either.
        try:
            relattr = relop_to_attrib [ast.leaf]
        except KeyError: # should never happen, how could we have lexed it?
            raise UnimplError (ast.leaf)
        def make_aelt (qual):
            val = ('numeric', qual [1])
            return z3950.AttributeElement (attributeType = qual[0],
                                           attributeValue = val)
        apt = z3950.AttributesPlusTerm ()
        # Work on a copy: appending to the list stored in the tree would add
        # the relation attribute again each time the same tree is converted.
        quallist = list (ast.children.quallist)
        if ast.leaf != '=':
            quallist.append ((2,relattr)) # 2 is relation attribute
            # see http://www.loc.gov/z3950/agency/markup/13.html ATR.1.1
        apt.attributes = [make_aelt (qual) for qual in quallist]
        apt.term = ('general', ast.children.val) # XXX update for V3?
        return ('op', ('attrTerm', apt))
    elif ast.type == 'set':
        return ('op', ('resultSet', ast.leaf))

    raise UnimplError("Bad ast type " + str(ast.type))


def _copy_lexer ():
    """Return a lexer that is private to the caller."""
    # need to copy or create a new lexer because it contains globals
    # PLY 1.0 lacks __copy__
    # PLY 1.3.1-1.5 have __copy__, but it's broken and returns None
    # I sent David Beazley a patch, so future PLY releases will
    # presumably work correctly.
    # Recreating the lexer each time is noticeably slower, so this solution
    # is suboptimal for PLY <= 1.5, but better than being thread-unsafe.
    # Perhaps I should have per-thread lexer instead XXX
    # with example/twisted/test.py set to parse_only, I get 277 parses/sec
    # with fixed PLY, vs. 63 parses/sec with broken PLY, on my 500 MHz PIII
    # laptop.
    copiedlexer = None
    if hasattr (lexer, '__copy__'):
        copiedlexer = lexer.__copy__ ()
    if copiedlexer is None:
        copiedlexer = lex.lex ()
    return copiedlexer


def mk_rpn_query (query):
    """Transform a CCL query into an RPN query."""
    ast = yacc.parse (query, _copy_lexer ())
    return ast_to_rpn (ast)

def ast_to_rpn (ast):
    """Wrap a parsed query tree into a ('type_1', RPNQuery) pair.

    An 'attrset' node at the top selects the attribute set; without one
    the Bib-1 attribute set is used.
    """
    if ast.type == 'attrset':
        attrset = attrset_to_oid (ast.leaf)
        ast = ast.children [0]
    else:
        attrset = oids.Z3950_ATTRS_BIB1_ov
    rpnq = z3950.RPNQuery (attributeSet = attrset)
    rpnq.rpn = tree_to_q (ast)
    return ('type_1', rpnq)

def testlex (s):
    """Debugging aid: print every token the lexer produces for s."""
    lexer.input (s)
    while 1:
        token = lexer.token ()
        if not token:
            break
        print (token)

def testyacc (s):
    """Debugging aid: parse s, then print the tree and the RPN query."""
    # Same lexer handling as mk_rpn_query, so a __copy__ that returns None
    # does not silently fall back to the shared lexer.
    ast = yacc.parse (s, lexer = _copy_lexer ())
    print ("AST: " + str (ast))
    print ("RPN Query: " + str (ast_to_rpn (ast)))

if __name__ == '__main__':
    testfn = testyacc
    #    testfn = testlex
    testfn ('attrset (BIB1/ au="Gaiman, Neil" or ti=Sandman)')
    try:
        read_query = raw_input
    except NameError: # interpreters without raw_input
        read_query = input
    while 1:
        try:
            s = read_query ('Query: ')
        except EOFError: # end of input ends the session like an empty line
            break
        if len (s) == 0:
            break
        testfn (s)
#    testyacc ()
#    testlex ()