Projects/worblehat-old
Projects
/
worblehat-old
Archived
12
0
Fork 0
This repository has been archived on 2024-07-04. You can view files and clone it, but cannot push or open issues or pull requests.
worblehat-old/python/PyZ3950/ccl.py

366 lines
10 KiB
Python

#!/usr/bin/env python
"""Implements part of CCL, the Common Command Language, ISO 8777. I'm
working from the description in the YAZ toolkit
(http://www.indexdata.dk/yaz/doc/tools.php), rather than the ISO
spec. Two extensions:
- qualifiers can be literal "(attrtyp, attrval)" pairs, so, e.g., the
following is a legitimate query for ISBN: "(1,7)=0312033095"
- the optional ATTRSET (attrset/query) which must appear at the beginning
of the string.
Allowed values are:
BIB1 (default)
XD1
UTIL
ZTHES1
EXP1
or an OID expressed as a dotted string. (A leading dot implies a
prefix of 1.2.840.10003.3, so, e.g., .1 is the same as BIB1.)
Eventually I will support v3-style mixing attribute sets within
a single query, but for now I don't.
"""
from __future__ import nested_scopes
import string
# Set to 1 when the PyZ3950 support modules cannot be imported; the import
# failure is reported but tolerated ("OK during setup").
in_setup = 0
try:
    from PyZ3950 import z3950
    from PyZ3950 import oids
    from PyZ3950 import asn1
    # Attribute-set names accepted in "ATTRSET (name/query)".  Keys are lower
    # case because attrset_to_oid() lower-cases the name before looking it up.
    _attrdict = {
        'bib1': oids.Z3950_ATTRS_BIB1_ov,
        'zthes1': oids.Z3950_ATTRS_ZTHES_ov,
        'xd1': oids.Z3950_ATTRS_XD1_ov,
        'utility': oids.Z3950_ATTRS_UTIL_ov,
        # The module docstring advertises this set as "UTIL"; register that
        # spelling too so the documented name resolves instead of being
        # rejected as a bad OID.  'utility' is kept for existing callers.
        'util': oids.Z3950_ATTRS_UTIL_ov,
        'exp1': oids.Z3950_ATTRS_EXP1_ov
    }
except ImportError:
    # sys.exc_info() fetches the active exception without relying on
    # version-specific "except X, e" / "except X as e" syntax.
    import sys
    err = sys.exc_info()[1]
    print ("Error importing (OK during setup) %s" % (err,))
    in_setup = 1
class QuerySyntaxError(Exception):
    """Base class for the errors this module raises for a bad CCL query."""


class ParseError(QuerySyntaxError):
    """Raised by the parser's error hook and for malformed attribute-set OIDs."""


class LexError(QuerySyntaxError):
    """Raised by the lexer's error hook when input cannot be tokenized."""


class UnimplError(QuerySyntaxError):
    """Raised when an AST contains something the RPN conversion cannot handle."""
# Names of every token type the lexer can emit; the grammar rules below
# refer to tokens by these names.
tokens = (
    'LPAREN',
    'RPAREN',
    'COMMA',
    'SET',
    'ATTRSET',
    'QUAL',
    'QUOTEDVALUE',
    'RELOP',
    'WORD',
    'LOGOP',
    'SLASH',
)

# Single-character punctuation tokens, given as plain regular expressions.
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_COMMA = r','
t_SLASH = r'/'
# Lexer rule for the ATTRSET keyword (matched case-insensitively).
# The string on the first line of the body is the token's regular expression,
# not prose documentation, so it must not be edited as if it were a comment.
# The token is returned unchanged.
def t_ATTRSET(t):
    r'(?i)ATTRSET'
    return t
# Lexer rule for the SET keyword.  It is written as a function (rather than a
# plain pattern string) so that it is tried before the generic WORD pattern.
# The string on the first line of the body is the token's regular expression.
# The negative lookahead stops the keyword from matching as a mere prefix of
# a longer word: "SETTLERS" must lex as one WORD, not SET followed by "TLERS".
# The grammar only allows a RELOP after SET, so no previously valid query
# depends on the old prefix match.
def t_SET (t):
    r'(SET)(?![A-Za-z0-9])'
    return t
# Maps each CCL relational operator to the value used for the relation
# attribute (attribute type 2) when the query is converted to RPN.
relop_to_attrib = {
    '<': 1,
    '<=': 2,
    '=': 3,
    '>=': 4,
    '>': 5,
    '<>': 6}
# Regular-expression alternation is ordered: the first alternative that
# matches wins.  Dictionary order is not a reliable ordering, so list the
# two-character operators first; otherwise "<=" or "<>" can be split into
# "<" followed by a second token.  Ties are broken alphabetically so the
# generated pattern is the same on every run.
t_RELOP = "|".join(["(%s)" % op for op in
                    sorted(relop_to_attrib, key=lambda op: (-len(op), op))])
# XXX Index Data docs say 'doesn't follow ... ISO8777'?
# XXX expand to rd. addt'l defns from file?
# Qualifier name -> (attribute type, attribute value).
# These are bib-1 attribute values, see
# http://www.loc.gov/z3950/agency/defns/bib1.html and
# ftp://ftp.loc.gov/pub/z3950/defs/bib1.txt
qual_dict = {
    # attribute type 1
    'TI': (1, 4),
    'AU': (1, 1003),        # use 1003 to work w/ both NLC-BNC and LC
    'ISBN': (1, 7),
    'LCCN': (1, 9),
    'ANY': (1, 1016),
    # attribute type 3
    'FIF': (3, 1),          # first-in-field
    'AIF': (3, 3),          # any-in-field (default)
    # attribute type 5
    'RTRUNC': (5, 1),
    'NOTRUNC': (5, 100),    # (default)
}

# Qualifiers applied to a bare value that has no explicit qualifier list.
default_quals = ['ANY']  # XXX should be per-attr-set
# Relational operator assumed for a bare value.
default_relop = '='
# Lexer rule for qualifier tokens.  It intentionally has no pattern string of
# its own: mk_quals() builds the regular expression from qual_dict and stores
# it in t_QUAL.__doc__, so do not add a docstring here.
# The token is returned unchanged.
def t_QUAL(t):
    return t
def mk_quals ():
    """Rebuild the QUAL token pattern from the current keys of qual_dict.

    The pattern is stored in t_QUAL.__doc__ and matches, case-insensitively,
    either one of the known qualifier names or a literal "(type,value)" pair
    of decimal numbers.
    """
    import re  # local import: only needed to escape the qualifier names
    # Escape each name so that a qualifier containing a regex metacharacter
    # is matched literally (xlate_qualifier looks the matched text up
    # verbatim, so a name could never usefully act as a pattern anyway).
    names = "|".join(['(' + re.escape(name) + ')'
                      for name in qual_dict.keys()])
    # The negative lookahead stops a qualifier from matching as a mere prefix
    # of an ordinary search word: "author" or "title" must lex as WORD, not
    # as AU + "thor" / TI + "tle".  In the grammar a QUAL is only ever
    # followed by COMMA or RELOP, so no valid query relied on the old
    # prefix match.
    t_QUAL.__doc__ = ("(?i)(?:" + names + ")(?![A-Za-z0-9])"
                      + r"|(\([0-9]+,[0-9]+\))")
# Lexer rule for a double-quoted value.  The string on the first line of the
# body is the token's regular expression (a non-greedy match between two
# double quotes).  The surrounding quotes are removed from the token value.
def t_QUOTEDVALUE(t):
    r"(\".*?\")"
    text = t.value
    if text[0] != '"':
        return t
    # Drop the first and last character, i.e. the enclosing quotes.
    t.value = text[1:-1]
    return t
# Characters that may start a WORD token: ASCII letters, digits, '&' and ':'.
word_init = "[a-z]|[A-Z]|[0-9]|&|:"
# Additional characters allowed after the first one: comma, period and
# apostrophe.  Written as a raw string so that the backslash reaches the
# regex engine without relying on Python passing the unrecognised escape
# "\." through unchanged; the resulting value is identical.
word_non_init = r",|\.|'"
# A WORD is one initial character followed by any number of initial or
# non-initial characters.
t_WORD = "(%s)(%s|%s)*" % (word_init, word_init, word_non_init)
# Lexer rule for the boolean operators AND, OR and NOT (case-insensitive).
# The string on the first line of the body is the token's regular expression.
# The negative lookahead requires the operator to end where the word ends:
# without it, any search term that merely starts with an operator is split,
# e.g. "orange" lexes as OR followed by the WORD "ange".
def t_LOGOP(t):
    r'(?i)(AND|OR|NOT)(?![A-Za-z0-9])'
    return t
# Characters the lexer skips between tokens: space and tab.
t_ignore = " \t"


def t_error(t):
    """Lexer error hook: report the offending token by raising LexError."""
    message = 't_error: ' + str(t)
    raise LexError(message)
from ply import lex
def relex ():
    """Rebuild the module-level lexer.

    Refreshes the QUAL pattern via mk_quals() first, so the lexer built by
    lex.lex() reflects the current contents of qual_dict, then stores the
    result in the global name `lexer`.
    """
    global lexer
    mk_quals ()
    lexer = lex.lex()
# Build the initial lexer when the module is loaded.
relex ()
def add_qual (qual_name, val):
    """Add a qualifier definition, and regenerate the lexer.

    qual_name is the qualifier as it will appear in queries; val is the
    (attribute type, attribute value) pair it stands for.

    The name is stored in upper case: the lexer matches qualifier names
    case-insensitively and xlate_qualifier() looks the matched text up as
    x.upper(), so a name stored in lower or mixed case would be recognised
    by the lexer but could never be resolved afterwards.
    """
    qual_dict[qual_name.upper ()] = val
    relex ()
from ply import yacc
#if in_setup:
# import yacc
#else:
# from PyZ3950 import yacc
class Node:
    """A node of the parsed query tree.

    type     -- node kind (the parser uses 'attrset', 'op', 'relop', 'set')
    children -- child nodes or other payload; a missing or empty value
                becomes a fresh empty list
    leaf     -- the value attached to this node (operator, set name, ...)
    """

    def __init__(self, type, children=None, leaf=None):
        self.type = type
        self.children = children or []
        self.leaf = leaf

    def str_child(self, child, depth):
        """Render one child at the given depth; non-Node children use str()."""
        if not isinstance(child, Node):
            padding = " " * (4 * depth)
            return "".join([padding, str(child), "\n"])
        return child.str_depth(depth)

    def str_depth(self, depth):
        """Render this node indented by `depth` levels, then its children."""
        header = "%s%s %s" % (" " * (4 * depth), self.type, self.leaf)
        rendered = [self.str_child(kid, depth + 1) for kid in self.children]
        return header + "\n" + "".join(rendered)

    def __str__(self):
        return "\n" + self.str_depth(0)
# Grammar rule: a complete query is a cclfind_or_attrset; its value is passed
# through unchanged.  The string in the body is the grammar production, not
# prose documentation.
def p_top (t):
    'top : cclfind_or_attrset'
    t[0] = t[1]
# Grammar rule: a query without an ATTRSET prefix is just a cclfind; its
# value is passed through unchanged.
def p_cclfind_or_attrset_1 (t):
    'cclfind_or_attrset : cclfind'
    t[0] = t[1]
# Grammar rule: "ATTRSET ( name / query )".  Produces an 'attrset' node whose
# leaf is the attribute-set name and whose only child is the parsed query.
def p_cclfind_or_attrset_2(t):
    'cclfind_or_attrset : ATTRSET LPAREN WORD SLASH cclfind RPAREN'
    attrset_name = t[3]
    query = t[5]
    t[0] = Node('attrset', children=[query], leaf=attrset_name)
# Grammar rule: two sub-queries joined by a boolean operator.  Produces an
# 'op' node whose leaf is the operator text and whose children are the left
# and right operands.
def p_ccl_find_1(t):
    'cclfind : cclfind LOGOP elements'
    left, operator, right = t[1], t[2], t[3]
    t[0] = Node('op', children=[left, right], leaf=operator)
# Grammar rule: a cclfind may consist of a single elements term; its value is
# passed through unchanged.
def p_ccl_find_2(t):
    'cclfind : elements'
    t[0] = t[1]
# Grammar rule: a parenthesised sub-query.  The parentheses are dropped and
# the value of the inner cclfind is used as-is.
def p_elements_1(t):
    'elements : LPAREN cclfind RPAREN'
    t[0] = t[2]
class QuallistVal:
    """Pairs a list of translated qualifiers with the search value.

    Besides the `quallist` and `val` attributes, the pair can be indexed:
    index 0 is the qualifier list and index 1 the value.
    """

    def __init__(self, quallist, val):
        self.quallist = quallist
        self.val = val

    def __str__(self):
        return "QV: " + str(self.quallist) + " " + str(self.val)

    def __getitem__(self, i):
        if i == 0:
            return self.quallist
        elif i == 1:
            return self.val
        else:
            # Any other index ends iteration / signals a bad subscript.
            raise IndexError('QuallistVal err ' + str(i))
def xlate_qualifier (x):
    """Translate a qualifier token into an (attribute type, value) pair.

    x is either a literal "(type,value)" pair of decimal numbers, which is
    converted to a tuple of two ints, or a qualifier name, which is looked up
    in qual_dict in upper case.

    Raises KeyError for a name that is not in qual_dict and ValueError for a
    literal pair whose fields are not integers.
    """
    # Slices instead of x[0] / x[-1] so an empty string falls through to the
    # dictionary lookup instead of raising IndexError.
    if x[:1] == '(' and x[-1:] == ')':
        fields = x[1:-1].split (',')  # the lexer guarantees exactly two fields
        # int() replaces the long-deprecated string.atoi().
        return (int (fields[0]), int (fields[1]))
    return qual_dict[x.upper ()]
# Grammar rule: a reference to a named result set, "SET = name".  Produces a
# 'set' node whose leaf is the set name.  Any relational operator other than
# '=' is rejected with QuerySyntaxError; the message is the three offending
# tokens joined by spaces.  (The original passed all three values to a single
# str() call, which raises TypeError instead of the intended error.)
def p_elements_2 (t):
    'elements : SET RELOP WORD'
    if t[2] != '=':
        raise QuerySyntaxError (" ".join ([str (t[1]), str (t[2]), str (t[3])]))
    t[0] = Node ('set', leaf = t[3])
# Grammar rule: a bare value with no qualifier.  The default qualifiers and
# the default relational operator are applied, producing a 'relop' node.
def p_elements_3(t):
    'elements : val'
    default_pairs = [xlate_qualifier(name) for name in default_quals]
    t[0] = Node('relop', QuallistVal(default_pairs, t[1]), default_relop)
# Grammar rule: "qualifiers relop value".  Each qualifier is translated to an
# (attribute type, value) pair; the result is a 'relop' node whose leaf is
# the relational operator.
def p_elements_4(t):
    'elements : quallist RELOP val'
    qualifier_pairs = [xlate_qualifier(name) for name in t[1]]
    operator, value = t[2], t[3]
    t[0] = Node('relop', QuallistVal(qualifier_pairs, value), operator)
# XXX p_elements_5 would be quals followed by recursive def'n, not yet implemented
# XXX p_elements_6 would be quals followed by range, not yet implemented.
# Grammar rule: a qualifier list with a single qualifier; the value is a
# one-element list holding the qualifier text.
def p_quallist_1 (t):
    'quallist : QUAL'
    t[0] = [t[1]]
# Grammar rule: a comma-separated qualifier list; the new qualifier is added
# to the end of a new list (the existing list is not modified in place).
def p_quallist_2 (t):
    'quallist : quallist COMMA QUAL'
    t[0] = t[1] + [t[3]]
# Grammar rule: a value given as a quoted string; the lexer has already
# removed the surrounding quotes.
def p_val_1(t):
    'val : QUOTEDVALUE'
    t[0] = t[1]
# Grammar rule: a value followed by another word; the two are joined with a
# single space, so consecutive words form one multi-word value.
def p_val_2(t):
    'val : val WORD'
    t[0] = t[1] + " " + t[2]
# Grammar rule: a value consisting of a single unquoted word.
def p_val_3(t):
    'val : WORD'
    t[0] = t[1]
# XXX also don't yet handle proximity operator
def p_error(t):
    """Parser error hook: raise ParseError describing the offending token."""
    message = 'Parse p_error ' + str(t)
    raise ParseError(message)
# Operator precedence table for the parser generator: the boolean operators
# (LOGOP) are declared left-associative.
precedence = (
('left', 'LOGOP'),
)
# Build the parser from the p_* rules above.  Debug output is disabled and
# the table module name is set to 'PyZ3950_parsetab'.
yacc.yacc (debug=0, tabmodule = 'PyZ3950_parsetab')
#yacc.yacc (debug=0, tabpackage = 'PyZ3950', tabmodule='PyZ3950_parsetab')
def attrset_to_oid (attrset):
    """Resolve an attribute-set name or dotted OID string to an OID value.

    attrset is matched case-insensitively against the known names in
    _attrdict.  Otherwise it is parsed as a dotted OID; a leading dot means
    "relative to oids.Z3950_ATTRS".

    Returns the entry from _attrdict, or a new asn1.OidVal.
    Raises ParseError when a component of the OID is not an integer.
    """
    name = attrset.lower ()
    if name in _attrdict:
        return _attrdict [name]
    components = name.split ('.')
    if components[0] == '':
        components = list (oids.Z3950_ATTRS) + components[1:]
    try:
        # int() instead of the deprecated string.atoi(): it also accepts
        # components that are already integers, which the prefix taken from
        # oids.Z3950_ATTRS presumably are (TODO confirm against oids.py);
        # string.atoi() rejects non-string arguments.
        intlist = [int (component) for component in components]
    except ValueError:
        raise ParseError ('Bad OID: ' + name)
    return asn1.OidVal (intlist)
def tree_to_q (ast):
    """Convert a parsed query tree into the nested RPN structure.

    Handles three node types:
      'op'    -- boolean operator; both children are converted recursively
                 and the result is ('rpnRpnOp', RpnRpnOp)
      'relop' -- qualifiers/relation/value; result is ('op', ('attrTerm', apt))
      'set'   -- result-set reference; result is ('op', ('resultSet', name))

    Raises UnimplError for any other node type or an unknown relational
    operator.  The tree itself is not modified.
    """
    if ast.type == 'op':
        myrpnRpnOp = z3950.RpnRpnOp ()
        myrpnRpnOp.rpn1 = tree_to_q (ast.children[0])
        myrpnRpnOp.rpn2 = tree_to_q (ast.children[1])
        op = ast.leaf.lower ()
        if op == 'not':
            op = 'and-not'  # CCL spec of 'not' vs. Z39.50 spec of 'and-not'
        myrpnRpnOp.op = (op, None)
        return ('rpnRpnOp', myrpnRpnOp)
    elif ast.type == 'relop':
        # XXX but e.g. LC (http://lcweb.loc.gov/z3950/lcserver.html)
        # doesn't support other relation attributes, either.
        try:
            relattr = relop_to_attrib [ast.leaf]
        except KeyError:  # should never happen, how could we have lexed it?
            raise UnimplError (ast.leaf)

        def make_aelt (qual):
            # qual is an (attribute type, attribute value) pair.
            val = ('numeric', qual [1])
            return z3950.AttributeElement (attributeType = qual[0],
                                           attributeValue = val)
        apt = z3950.AttributesPlusTerm ()
        # Work on a copy: appending to the list stored in the tree would add
        # another relation attribute every time the same tree is converted.
        quallist = list (ast.children.quallist)
        if ast.leaf != '=':
            quallist.append ((2, relattr))  # 2 is relation attribute
            # see http://www.loc.gov/z3950/agency/markup/13.html ATR.1.1
        apt.attributes = [make_aelt (qual) for qual in quallist]
        apt.term = ('general', ast.children.val)  # XXX update for V3?
        return ('op', ('attrTerm', apt))
    elif ast.type == 'set':
        return ('op', ('resultSet', ast.leaf))
    raise UnimplError ("Bad ast type " + str (ast.type))
def mk_rpn_query (query):
    """Transform a CCL query into an RPN query.

    query is the CCL query string; the return value is whatever
    ast_to_rpn() produces for the parsed tree.  Lexer and parser errors
    propagate to the caller.
    """
    # need to copy or create a new lexer because it contains globals
    # PLY 1.0 lacks __copy__
    # PLY 1.3.1-1.5 have __copy__, but it's broken and returns None
    # I sent David Beazley a patch, so future PLY releases will
    # presumably work correctly.
    # Recreating the lexer each time is noticeably slower, so this solution
    # is suboptimal for PLY <= 1.5, but better than being thread-unsafe.
    # Perhaps I should have per-thread lexer instead XXX
    # with example/twisted/test.py set to parse_only, I get 277 parses/sec
    # with fixed PLY, vs. 63 parses/sec with broken PLY, on my 500 MHz PIII
    # laptop.
    #
    # Later PLY releases expose the copy operation as clone(), so try that
    # first and fall back to __copy__ for old releases; without this, a
    # lexer that only offers clone() is rebuilt on every call.
    copiedlexer = None
    for method_name in ('clone', '__copy__'):
        duplicate = getattr (lexer, method_name, None)
        if duplicate is not None:
            copiedlexer = duplicate ()
            if copiedlexer is not None:
                break
    if copiedlexer is None:
        copiedlexer = lex.lex ()
    ast = yacc.parse (query, lexer = copiedlexer)
    return ast_to_rpn (ast)
def ast_to_rpn(ast):
    """Wrap a parsed query tree in an RPN query.

    An 'attrset' root node selects the attribute set named by its leaf and
    its single child is the actual query; any other root uses the BIB1
    attribute set.  Returns the tuple ('type_1', RPNQuery).
    """
    if ast.type != 'attrset':
        attrset = oids.Z3950_ATTRS_BIB1_ov
        query_tree = ast
    else:
        attrset = attrset_to_oid(ast.leaf)
        query_tree = ast.children[0]
    rpnq = z3950.RPNQuery(attributeSet=attrset)
    rpnq.rpn = tree_to_q(query_tree)
    return ('type_1', rpnq)
def testlex(s):
    """Debug helper: tokenize s with the module-level lexer, printing each token."""
    lexer.input(s)
    token = lexer.token()
    while token:
        print (token)
        token = lexer.token()
def testyacc (s):
    """Debug helper: parse s and print both the tree and the RPN query."""
    # Obtain a private lexer the same way mk_rpn_query() does.  Calling
    # lexer.__copy__() unconditionally fails when the method is missing, and
    # mk_rpn_query's notes say some PLY releases return None from it.
    copylex = None
    for method_name in ('clone', '__copy__'):
        duplicate = getattr (lexer, method_name, None)
        if duplicate is not None:
            copylex = duplicate ()
            if copylex is not None:
                break
    if copylex is None:
        copylex = lex.lex ()
    ast = yacc.parse (s, lexer = copylex)
    # Single-argument print with %-formatting produces the same text whether
    # print is a statement or a function.
    print ("AST: %s" % (ast,))
    print ("RPN Query: %s" % (ast_to_rpn (ast),))
# Interactive driver: run one sample query, then keep reading queries from
# the prompt until an empty line is entered.
if __name__ == '__main__':
    testfn = testyacc
    # testfn = testlex
    testfn('attrset (BIB1/ au="Gaiman, Neil" or ti=Sandman)')
    while True:
        s = raw_input('Query: ')
        if not s:
            break
        testfn(s)
# testyacc ()
# testlex ()