407 lines
12 KiB
Python
407 lines
12 KiB
Python
|
|
||
|
#!/usr/local/bin/python2.3
|
||
|
|
||
|
try:
|
||
|
from cStringIO import StringIO
|
||
|
except:
|
||
|
from StringIO import StringIO
|
||
|
from PyZ3950 import z3950, oids
|
||
|
from types import IntType, StringType, ListType
|
||
|
# We need "\"\"" to be one token
|
||
|
from PyZ3950.CQLParser import CQLshlex
|
||
|
from PyZ3950.CQLUtils import ZCQLConfig
|
||
|
from PyZ3950.zdefs import make_attr
|
||
|
zconfig = ZCQLConfig()
|
||
|
|
||
|
"""
|
||
|
http://cheshire.berkeley.edu/cheshire2.html#zfind
|
||
|
|
||
|
top ::= query ['resultsetid' name]
|
||
|
query ::= query boolean clause | clause
|
||
|
clause ::= '(' query ')'
|
||
|
| attributes [relation] term
|
||
|
| resultset
|
||
|
attributes ::= '[' { [set] type '=' value } ']' | name
|
||
|
boolean ::= 'and' | 'or' | 'not' | (synonyms)
|
||
|
prox ::= ('!PROX' | (synonyms)) {'/' name}
|
||
|
relation ::= '>' | '<' | ...
|
||
|
|
||
|
[bib1 1=5, bib1 3=6] > term and title @ fish
|
||
|
"""
|
||
|
|
||
|
booleans = {'AND' : 'and',
|
||
|
'.AND.' : 'and',
|
||
|
'&&' : 'and',
|
||
|
'OR' : 'or',
|
||
|
'.OR.' : 'or',
|
||
|
'||' : 'or',
|
||
|
'NOT' : 'and-not',
|
||
|
'.NOT.' : 'and-not',
|
||
|
'ANDNOT' : 'and-not',
|
||
|
'.ANDNOT.' : 'and-not',
|
||
|
'!!' : 'and-not'
|
||
|
}
|
||
|
|
||
|
relations = {'<' : 1,
|
||
|
'LT' : 1,
|
||
|
'.LT.' : 1,
|
||
|
'<=' : 2,
|
||
|
'LE' : 2,
|
||
|
'.LE.' : 2,
|
||
|
'=' : 3,
|
||
|
'>=' : 4,
|
||
|
'GE' : 4,
|
||
|
'.GE.' : 4,
|
||
|
'>' : 5,
|
||
|
'GT' : 5,
|
||
|
'.GT.' : 5,
|
||
|
'<>' : 6,
|
||
|
'!=' : 6,
|
||
|
'NE' : 6,
|
||
|
'.NE.' : 6,
|
||
|
'?' : 100,
|
||
|
'PHON' : 100,
|
||
|
'.PHON.' : 100,
|
||
|
'%' : 101,
|
||
|
'STEM' : 101,
|
||
|
'.STEM.' : 101,
|
||
|
'@' : 102,
|
||
|
'REL' : 102,
|
||
|
'.REL.' : 102,
|
||
|
'<=>' : 104,
|
||
|
'WITHIN' : 104,
|
||
|
'.WITHIN.' : 104}
|
||
|
|
||
|
geoRelations = {'>=<' : 7,
|
||
|
'.OVERLAPS.' : 7,
|
||
|
'>#<' : 8,
|
||
|
'.FULLY_ENCLOSED_WITHIN.' : 8,
|
||
|
'<#>' : 9,
|
||
|
'.ENCLOSES.' : 9,
|
||
|
'<>#' : 10,
|
||
|
'.OUTSIDE_OF.' : 10,
|
||
|
'+-+' : 11,
|
||
|
'.NEAR.' : 11,
|
||
|
'.#.' : 12,
|
||
|
'.MEMBERS_CONTAIN.' : 12,
|
||
|
'!.#.' : 13,
|
||
|
'.MEMBERS_NOT_CONTAIN.' : 13,
|
||
|
':<:' : 14,
|
||
|
'.BEFORE.' : 14,
|
||
|
':<=:' : 15,
|
||
|
'.BEFORE_OR_DURING.' : 15,
|
||
|
':=:' : 16,
|
||
|
'.DURING.' : 16,
|
||
|
':>=:' : 17,
|
||
|
'.DURING_OR_AFTER.' : 17,
|
||
|
':>:' : 18,
|
||
|
'.AFTER.' : 18}
|
||
|
|
||
|
proxBooleans = {'!PROX' : (2, 0, 2),
|
||
|
'!ADJ' : (2, 0, 2),
|
||
|
'!NEAR' : (20, 0, 2),
|
||
|
'!FAR' : (20, 0, 4),
|
||
|
'!OPROX' : (2, 1, 2),
|
||
|
'!OADJ' : (2, 1, 2),
|
||
|
'!ONEAR' : (20, 1, 2),
|
||
|
'!OFAR' : (20, 1, 4)}
|
||
|
|
||
|
proxUnits = {'C' : 1,
|
||
|
'CHAR' : 1,
|
||
|
'W' : 2,
|
||
|
'WORD' : 2,
|
||
|
'S' : 3,
|
||
|
'SENT' : 3,
|
||
|
'SENTENCE' : 3,
|
||
|
'P' : 4,
|
||
|
'PARA' : 4,
|
||
|
'PARAGRAPH' : 4,
|
||
|
'SECTION' : 5,
|
||
|
'CHAPTER' : 6,
|
||
|
'DOCUMENT' : 7,
|
||
|
'ELEMENT' : 8,
|
||
|
'SUBELEMENT' : 9,
|
||
|
'ELEMENTTYPE' : 10,
|
||
|
'BYTE' : 11}
|
||
|
|
||
|
privateBooleans = {'!FUZZY_AND' : 1,
|
||
|
'!FUZZY_OR' : 2,
|
||
|
'!FUZZY_NOT' : 3,
|
||
|
'!RESTRICT_FROM' : 4,
|
||
|
'!RESTRICT_TO' : 5,
|
||
|
'!MERGE_SUM' : 6,
|
||
|
'!MERGE_MEAN' : 7,
|
||
|
'!MERGE_NORM' : 8}
|
||
|
|
||
|
xzconfig = ZCQLConfig()
|
||
|
|
||
|
class C2Parser:
|
||
|
lexer = None
|
||
|
currentToken = None
|
||
|
nextToken = None
|
||
|
|
||
|
def __init__(self, l):
|
||
|
self.lexer = l
|
||
|
self.fetch_token()
|
||
|
|
||
|
|
||
|
def fetch_token(self):
|
||
|
tok = self.lexer.get_token()
|
||
|
self.currentToken = self.nextToken
|
||
|
self.nextToken = tok
|
||
|
|
||
|
def is_boolean(self, tok=None):
|
||
|
if (tok == None):
|
||
|
tok = self.currentToken
|
||
|
if (privateBooleans.has_key(tok.upper())):
|
||
|
return 1
|
||
|
elif (booleans.has_key(tok.upper())):
|
||
|
return 2
|
||
|
elif (proxBooleans.has_key(tok.upper())):
|
||
|
return 3
|
||
|
else:
|
||
|
return 0
|
||
|
|
||
|
|
||
|
def top(self):
|
||
|
|
||
|
rpn = self.query()
|
||
|
# Check for resultsetid
|
||
|
if (self.currentToken.lower() == 'resultsetid'):
|
||
|
self.fetch_token()
|
||
|
resultset = self.currentToken
|
||
|
else:
|
||
|
resultset = None
|
||
|
|
||
|
rpnq = z3950.RPNQuery()
|
||
|
rpnq.attributeSet = oids.Z3950_ATTRS_BIB1_ov
|
||
|
rpnq.rpn = rpn
|
||
|
q = ('type_1', rpnq)
|
||
|
return (q, resultset)
|
||
|
|
||
|
def query(self):
|
||
|
self.fetch_token()
|
||
|
left = self.subquery()
|
||
|
while 1:
|
||
|
if not self.currentToken:
|
||
|
break
|
||
|
bool = self.is_boolean()
|
||
|
if bool:
|
||
|
bool = self.boolean()
|
||
|
right = self.subquery()
|
||
|
# Put left into triple, make triple new left
|
||
|
op = z3950.RpnRpnOp()
|
||
|
op.rpn1 = left
|
||
|
op.rpn2 = right
|
||
|
op.op = bool
|
||
|
wrap = ('rpnRpnOp', op)
|
||
|
left = wrap
|
||
|
else:
|
||
|
break
|
||
|
return left
|
||
|
|
||
|
|
||
|
def subquery(self):
|
||
|
if self.currentToken == "(":
|
||
|
object = self.query()
|
||
|
if (self.currentToken <> ")"):
|
||
|
raise ValueError
|
||
|
else:
|
||
|
self.fetch_token()
|
||
|
else:
|
||
|
object = self.clause()
|
||
|
return object
|
||
|
|
||
|
def boolean(self):
|
||
|
tok = self.currentToken.upper()
|
||
|
self.fetch_token()
|
||
|
if (booleans.has_key(tok)):
|
||
|
return (booleans[tok], None)
|
||
|
elif (privateBooleans.has_key(tok)):
|
||
|
# Generate cutesie prox trick
|
||
|
type = privateBooleans[tok]
|
||
|
prox = z3950.ProximityOperator()
|
||
|
prox.proximityUnitCode = ('private', type)
|
||
|
prox.distance = 0
|
||
|
prox.ordered = 0
|
||
|
prox.relationType = 3
|
||
|
return ('op', ('prox', prox))
|
||
|
|
||
|
elif (proxBooleans.has_key(tok)):
|
||
|
# Generate prox
|
||
|
prox = z3950.ProximityOperator()
|
||
|
stuff = proxBooleans[tok]
|
||
|
prox.distance = stuff[0]
|
||
|
prox.ordered = stuff[1]
|
||
|
prox.relationType = stuff[2]
|
||
|
prox.proximityUnitCode = ('known', 2)
|
||
|
|
||
|
# Now look for /
|
||
|
while (self.currentToken == "/"):
|
||
|
self.fetch_token()
|
||
|
if (self.currentToken.isdigit()):
|
||
|
prox.distance = int(self.currentToken)
|
||
|
elif (proxUnits.has_key(self.currentToken.upper())):
|
||
|
prox.proximityUnitCode = ('known', proxUnits[self.currentToken.upper()])
|
||
|
else:
|
||
|
raise ValueError
|
||
|
self.fetch_token()
|
||
|
return ('op', ('prox', prox))
|
||
|
else:
|
||
|
# Argh!
|
||
|
raise ValueError
|
||
|
|
||
|
def clause(self):
|
||
|
|
||
|
if (self.is_boolean(self.nextToken) or not self.nextToken or self.nextToken.lower() == 'resultsetid' or self.nextToken == ")"):
|
||
|
# Must be a resultset
|
||
|
tok = self.currentToken
|
||
|
self.fetch_token()
|
||
|
return ('op', ('resultSet', tok))
|
||
|
|
||
|
elif (self.currentToken == '['):
|
||
|
# List of attributes
|
||
|
attrs = []
|
||
|
oidHash = oids.oids['Z3950']['ATTRS']
|
||
|
while (1):
|
||
|
self.fetch_token()
|
||
|
|
||
|
if (self.currentToken == ']'):
|
||
|
break
|
||
|
|
||
|
if (oidHash.has_key(self.currentToken)):
|
||
|
attrSet = oidHash[self.currentToken]['ov']
|
||
|
self.fetch_token()
|
||
|
elif (self.currentToken[:8] == '1.2.840.'):
|
||
|
attrSet = asn1.OidVal(map(int, self.currentToken.split('.')))
|
||
|
self.fetch_token()
|
||
|
else:
|
||
|
attrSet = None
|
||
|
|
||
|
if (self.currentToken[-1] == ','):
|
||
|
tok = self.currentToken[:-1]
|
||
|
else:
|
||
|
tok = self.currentToken
|
||
|
|
||
|
if (tok.isdigit()):
|
||
|
# 1 = foo
|
||
|
atype = int(tok)
|
||
|
self.fetch_token()
|
||
|
if (self.currentToken == '='):
|
||
|
# = foo
|
||
|
self.fetch_token()
|
||
|
|
||
|
if (self.currentToken[0] == '='):
|
||
|
# =foo
|
||
|
tok = self.currentToken[1:]
|
||
|
else:
|
||
|
tok = self.currentToken
|
||
|
|
||
|
if (tok[-1] == ','):
|
||
|
tok = tok[:-1]
|
||
|
|
||
|
if (tok.isdigit()):
|
||
|
val = int(tok)
|
||
|
else:
|
||
|
val = tok
|
||
|
if (val[0] == "'" and val[-1] == "'"):
|
||
|
val = val[1:-1]
|
||
|
elif (tok[-1] == '='):
|
||
|
#1= foo
|
||
|
tok = tok[:-1]
|
||
|
if (tok.isdigit()):
|
||
|
atype = int(tok)
|
||
|
self.fetch_token()
|
||
|
if (self.currentToken[-1] == ","):
|
||
|
tok = self.currentToken[:-1]
|
||
|
else:
|
||
|
tok = self.currentToken
|
||
|
if (tok.isdigit()):
|
||
|
val = int(self.currentToken)
|
||
|
else:
|
||
|
val = tok
|
||
|
if (val[0] == "'" and val[-1] == "'"):
|
||
|
val = val[1:-1]
|
||
|
|
||
|
elif (tok.find('=') > -1):
|
||
|
# 1=foo
|
||
|
(atype, val) = self.currentToken.split('=')
|
||
|
atype = int(atype)
|
||
|
if (val[-1] == ","):
|
||
|
val = val[:-1]
|
||
|
if (val.isdigit()):
|
||
|
val = int(val)
|
||
|
elif (val[0] == "'" and val[-1] == "'"):
|
||
|
val = val[1:-1]
|
||
|
else:
|
||
|
# ???
|
||
|
raise ValueError
|
||
|
attrs.append([attrSet, atype, val])
|
||
|
|
||
|
else:
|
||
|
# Check for named index
|
||
|
if (zconfig.BIB1.has_key(self.currentToken.lower())):
|
||
|
attrs = [[oids.Z3950_ATTRS_BIB1_ov, 1, zconfig.BIB1[self.currentToken.lower()]]]
|
||
|
else:
|
||
|
# Just pass through the name
|
||
|
attrs = [[oids.Z3950_ATTRS_BIB1_ov, 1, self.currentToken]]
|
||
|
|
||
|
self.fetch_token()
|
||
|
# Check for relation
|
||
|
tok = self.currentToken.upper()
|
||
|
if (relations.has_key(tok)):
|
||
|
val = relations[tok]
|
||
|
found = 0
|
||
|
for a in attrs:
|
||
|
if (a[0] in [oids.Z3950_ATTRS_BIB1, None] and a[1] == 2):
|
||
|
found =1
|
||
|
a[2] = val
|
||
|
break
|
||
|
if (not found):
|
||
|
attrs.append([None, 2, val])
|
||
|
self.fetch_token()
|
||
|
elif (geoRelations.has_key(tok)):
|
||
|
val = geoRelations[tok]
|
||
|
found = 0
|
||
|
for a in attrs:
|
||
|
if (a[0] in [oids.Z3950_ATTRS_BIB1, oids.Z3950_ATTRS_GEO, None] and a[1] == 2):
|
||
|
found = 1
|
||
|
a[2] = val
|
||
|
break
|
||
|
if (not found):
|
||
|
attrs.append([oids.Z3950_ATTRS_GEO, 2, val])
|
||
|
self.fetch_token()
|
||
|
|
||
|
if (self.currentToken.find(' ')):
|
||
|
# Already quoted
|
||
|
term = self.currentToken
|
||
|
else:
|
||
|
# Accumulate
|
||
|
term = []
|
||
|
while (self.currentToken and not self.is_boolean(self.currentToken) and self.currentToken.lower() != 'resultsetid'):
|
||
|
term.append(self.currenToken)
|
||
|
term = ' '.join(term)
|
||
|
|
||
|
self.fetch_token()
|
||
|
|
||
|
# Phew. Now build AttributesPlusTerm
|
||
|
clause = z3950.AttributesPlusTerm()
|
||
|
clause.attributes = [make_attr(*e) for e in attrs]
|
||
|
clause.term = ('general', term)
|
||
|
return ('op', ('attrTerm', clause))
|
||
|
|
||
|
|
||
|
def parse(q):
|
||
|
|
||
|
query = StringIO(q)
|
||
|
lexer = CQLshlex(query)
|
||
|
# Override CQL's wordchars list to include /=><
|
||
|
lexer.wordchars += "!@#$%^&*-+;,.?|~`:\\><='"
|
||
|
lexer.wordchars = lexer.wordchars.replace('[', '')
|
||
|
lexer.wordchars = lexer.wordchars.replace(']', '')
|
||
|
|
||
|
|
||
|
parser = C2Parser(lexer)
|
||
|
return parser.top()
|
||
|
|