#!/usr/bin/python

# Author:  Rob Sanderson (azaroth@liv.ac.uk)
# Distributed and Usable under the GPL
# Version: 1.7
# Most Recent Changes: contexts, new modifier style for 1.1
#
# With thanks to Adam from IndexData and Mike Taylor for their valuable input
#
# Parses CQL query strings (parse) and XCQL documents (xmlparse / xcqlparse)
# into a tree of Triple / SearchClause objects which can be serialised back
# with toCQL() and toXCQL().  The code is written in the syntax subset that
# is shared by Python 2 and Python 3.

from shlex import shlex
from xml.sax.saxutils import escape
from xml.dom.minidom import Node, parseString
from PyZ3950.SRWDiagnostics import *
# Don't use cStringIO as it borks Unicode (apparently)
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
import types

# Parsing strictness flags
errorOnEmptyTerm = 0          # index = ""      (often meaningless)
errorOnQuotedIdentifier = 0   # "/foo/bar" = "" (unnecessary BNF restriction)
errorOnDuplicatePrefix = 0    # >a=b >a=c ""    (impossible due to BNF)
fullResultSetNameCheck = 1    # srw.rsn=foo and srw.rsn=foo  (mutant!!)

# Base values for CQL
serverChoiceRelation = "scr"
serverChoiceIndex = "cql.serverchoice"

order = ['=', '>', '>=', '<', '<=', '<>']
modifierSeparator = "/"
booleans = ['and', 'or', 'not', 'prox']

reservedPrefixes = {"srw": "http://www.loc.gov/zing/cql/srw-indexes/v1.0/",
                    "cql": "info:srw/cql-context-set/1/cql-v1.1"}

XCQLNamespace = "http://www.loc.gov/zing/cql/xcql/"

# End of 'configurable' stuff


class PrefixableObject:
    "Root object for triple and searchClause"
    # Class-level defaults; __init__ gives every instance its own dictionary.
    prefixes = {}
    parent = None
    config = None

    def __init__(self):
        self.prefixes = {}
        self.parent = None
        self.config = None

    def toXCQL(self, depth=0):
        "Return the <prefixes> element for this object's prefix map"
        # Just generate our prefixes
        space = " " * depth
        xml = ['%s<prefixes>\n' % (space)]
        for p in self.prefixes.keys():
            xml.append("%s <prefix>\n"
                       "%s  <name>%s</name>\n"
                       "%s  <identifier>%s</identifier>\n"
                       "%s </prefix>\n"
                       % (space, space, escape(p),
                          space, escape(self.prefixes[p]), space))
        xml.append("%s</prefixes>\n" % (space))
        return ''.join(xml)

    def _prefixesToCQL(self):
        "Return the prefix map as a list of CQL '>name=\"uri\"' fragments"
        fragments = []
        for p in self.prefixes.keys():
            if p != '':
                fragments.append('>%s="%s"' % (p, self.prefixes[p]))
            else:
                # The unnamed (default) map has no 'name=' part
                fragments.append('>"%s"' % (self.prefixes[p]))
        return fragments

    def addPrefix(self, name, identifier):
        """Map prefix `name` to `identifier`.

        Raises Diagnostic45 when errorOnDuplicatePrefix is set and the name
        is already mapped here or is one of the reserved prefixes.
        """
        if (errorOnDuplicatePrefix and
                (name in self.prefixes or name in reservedPrefixes)):
            # Maybe error
            diag = Diagnostic45()
            diag.details = name
            raise diag
        self.prefixes[name] = identifier

    def resolvePrefix(self, name):
        """Return the identifier mapped to `name`, or None if unresolved.

        Reserved prefixes win, then this object's own map, then the parent
        chain, then the optional config object.
        """
        # Climb tree
        if name in reservedPrefixes:
            return reservedPrefixes[name]
        elif name in self.prefixes:
            return self.prefixes[name]
        elif self.parent is not None:
            return self.parent.resolvePrefix(name)
        elif self.config is not None:
            # Config is some sort of server config which specifies defaults
            return self.config.resolvePrefix(name)
        else:
            # Top of tree, no config, no resolution->Unknown indexset
            # For client we need to allow no prefix?
            return None


class PrefixedObject:
    "Root object for relation, relationModifier and index"
    prefix = ""
    prefixURI = ""
    value = ""
    parent = None

    def __init__(self, val):
        """Store `val` lower-cased, unquoted and split into prefix and value.

        Raises Diagnostic14 for a quoted identifier when
        errorOnQuotedIdentifier is set, and Diagnostic15 from splitValue.
        """
        # All prefixed things are case insensitive
        val = val.lower()
        if val and val[0] == '"' and val[-1] == '"':
            if errorOnQuotedIdentifier:
                diag = Diagnostic14()
                diag.details = val
                raise diag
            else:
                val = val[1:-1]
        self.value = val
        self.splitValue()

    def __str__(self):
        if (self.prefix):
            return "%s.%s" % (self.prefix, self.value)
        else:
            return self.value

    def splitValue(self):
        """Split 'prefix.value' held in self.value into its two parts.

        Raises Diagnostic15 when there is more than one '.' or when the part
        before the '.' is empty.  A value without '.' is left untouched.
        """
        f = self.value.find(".")
        if (self.value.count('.') > 1):
            diag = Diagnostic15()
            diag.details = "Multiple '.' characters: %s" % (self.value)
            raise diag
        elif (f == 0):
            diag = Diagnostic15()
            diag.details = "Null indexset: %s" % (self.value)
            raise diag
        elif f >= 0:
            self.prefix = self.value[:f].lower()
            self.value = self.value[f+1:].lower()

    def resolvePrefix(self):
        "Return the URI for our prefix, asking the parent once and caching it"
        if (not self.prefixURI):
            self.prefixURI = self.parent.resolvePrefix(self.prefix)
        return self.prefixURI


class ModifiableObject:
    # Treat modifiers as keys on boolean/relation?
    # Class-level default only; Relation and Boolean assign a per-instance
    # list in their constructors.
    modifiers = []

    def __getitem__(self, k):
        """Return a modifier by position (int) or by type name (string).

        Returns None when nothing matches.  Because an out-of-range index
        yields None instead of raising IndexError, instances must not be
        iterated directly; iterate over .modifiers instead.
        """
        if isinstance(k, int):
            try:
                return self.modifiers[k]
            except IndexError:
                return None
        for m in self.modifiers:
            if (str(m.type) == k or m.type.value == k):
                return m
        return None

    def _modifiersToXCQL(self, depth):
        "Return the XCQL lines for our modifiers ([] when there are none)"
        if not self.modifiers:
            return []
        space = " " * depth
        xml = ["%s <modifiers>\n" % (space)]
        for m in self.modifiers:
            xml.append(m.toXCQL(depth+2))
        xml.append("%s </modifiers>\n" % (space))
        return xml


class Triple (PrefixableObject):
    "Object to represent a CQL triple"
    leftOperand = None
    boolean = None
    rightOperand = None

    def toXCQL(self, depth=0):
        "Create the XCQL representation of the object"
        space = " " * depth
        if (depth == 0):
            # Only the outermost element carries the namespace declaration
            xml = ['<triple xmlns="%s">\n' % (XCQLNamespace)]
        else:
            xml = ['%s<triple>\n' % (space)]

        if self.prefixes:
            xml.append(PrefixableObject.toXCQL(self, depth+1))

        xml.append(self.boolean.toXCQL(depth+1))
        xml.append("%s <leftOperand>\n" % (space))
        xml.append(self.leftOperand.toXCQL(depth+2))
        xml.append("%s </leftOperand>\n" % (space))
        xml.append("%s <rightOperand>\n" % (space))
        xml.append(self.rightOperand.toXCQL(depth+2))
        xml.append("%s </rightOperand>\n" % (space))
        xml.append("%s</triple>\n" % (space))
        return ''.join(xml)

    def toCQL(self):
        "Create the parenthesised CQL representation of the object"
        parts = []
        if (self.prefixes):
            parts.append(' '.join(self._prefixesToCQL()))
        parts.append(self.leftOperand.toCQL())
        parts.append(self.boolean.toCQL())
        parts.append(self.rightOperand.toCQL())
        return "(%s)" % (' '.join(parts))

    def getResultSetId(self, top=None):
        """Return the result set id if the whole tree names one result set.

        Called without `top` (the outermost triple) this returns a string:
        the shared id when every clause below names the same result set,
        otherwise "".  Called with `top` (a nested triple) it returns the
        list of ids found below it for the outermost call to compare.
        """
        if fullResultSetNameCheck == 0 or self.boolean.value in ['not', 'prox']:
            if top is None:
                return ""
            # Nested call: contribute a non-matching entry so the outermost
            # comparison fails instead of silently ignoring this subtree.
            return [""]

        if top is None:
            topLevel = 1
            top = self
        else:
            topLevel = 0

        # Iterate over operands and build a list
        rsList = []
        for operand in (self.leftOperand, self.rightOperand):
            if isinstance(operand, Triple):
                rsList.extend(operand.getResultSetId(top))
            else:
                rsList.append(operand.getResultSetId(top))

        if topLevel == 1:
            # Check all elements are the same, if so we're a fubar form of present
            if (len(rsList) == rsList.count(rsList[0])):
                return rsList[0]
            else:
                return ""
        else:
            return rsList


class SearchClause (PrefixableObject):
    "Object to represent a CQL searchClause"
    index = None
    relation = None
    term = None

    def __init__(self, ind, rel, t):
        "Store index, relation and term and make this clause their parent"
        PrefixableObject.__init__(self)
        self.index = ind
        self.relation = rel
        self.term = t
        ind.parent = self
        rel.parent = self
        t.parent = self

    def toXCQL(self, depth=0):
        "Produce XCQL version of the object"
        space = " " * depth
        if (depth == 0):
            # Only the outermost element carries the namespace declaration
            xml = ['<searchClause xmlns="%s">\n' % (XCQLNamespace)]
        else:
            xml = ['%s<searchClause>\n' % (space)]

        if self.prefixes:
            xml.append(PrefixableObject.toXCQL(self, depth+1))

        xml.append(self.index.toXCQL(depth+1))
        xml.append(self.relation.toXCQL(depth+1))
        xml.append(self.term.toXCQL(depth+1))
        xml.append("%s</searchClause>\n" % (space))
        return ''.join(xml)

    def toCQL(self):
        "Produce CQL version of the object: prefixes, index, relation, term"
        text = self._prefixesToCQL()
        # Term unescapes \" when it strips the surrounding quotes, so the
        # quotes have to be escaped again for the output to re-parse.
        term = str(self.term).replace('"', '\\"')
        text.append('%s %s "%s"' % (self.index, self.relation.toCQL(), term))
        return ' '.join(text)

    def getResultSetId(self, top=None):
        "Return the term if the index is cql.resultsetid, otherwise ''"
        idx = self.index
        idx.resolvePrefix()
        if (idx.prefixURI == reservedPrefixes['cql'] and
                idx.value.lower() == 'resultsetid'):
            return self.term.value
        else:
            return ""


class Index(PrefixedObject):
    "Object to represent a CQL index"

    def toXCQL(self, depth=0):
        "Create XCQL representation of object"
        if (depth == 0):
            ns = ' xmlns="%s"' % (XCQLNamespace)
        else:
            ns = ""
        return "%s<index%s>%s</index>\n" % (" "*depth, ns, escape(str(self)))

    def toCQL(self):
        return str(self)


class Relation(PrefixedObject, ModifiableObject):
    "Object to represent a CQL relation"

    def __init__(self, rel, mods=None):
        """Create relation `rel` with an optional list of modifier clauses.

        The prefix defaults to "cql" unless `rel` carries its own.
        """
        self.prefix = "cql"
        PrefixedObject.__init__(self, rel)
        if mods is None:
            # Fresh list per instance rather than a shared default argument
            mods = []
        self.modifiers = mods
        for m in mods:
            m.parent = self

    def toXCQL(self, depth=0):
        "Create XCQL representation of object"
        if (depth == 0):
            ns = ' xmlns="%s"' % (XCQLNamespace)
        else:
            ns = ""

        space = " " * depth
        xml = ["%s<relation%s>\n" % (space, ns)]
        xml.append("%s <value>%s</value>\n" % (space, escape(self.value)))
        xml.extend(self._modifiersToXCQL(depth))
        xml.append("%s</relation>\n" % (space))
        return ''.join(xml)

    def toCQL(self):
        "Return the relation value followed by its '/'-separated modifiers"
        txt = [self.value]
        txt.extend(map(str, self.modifiers))
        return '/'.join(txt)


class Term:
    "Object to represent a CQL term"
    value = ""

    def __init__(self, v):
        """Validate and unquote the raw term token `v`.

        Raises Diagnostic25 when the token is a bare operator, Diagnostic32
        when it consists only of '^' anchors, Diagnostic26 for a backslash
        that does not escape one of ? \\ * ^, and Diagnostic27 for an empty
        term when errorOnEmptyTerm is set.
        """
        if (v != ""):
            # Unquoted literal
            if v in ['>=', '<=', '>', '<', '<>', "/", '=']:
                diag = Diagnostic25()
                diag.details = v
                raise diag

            # Check existence of meaningful term
            if not v.strip("^"):
                diag = Diagnostic32()
                diag.details = "Only anchoring charater(s) in term: " + v
                raise diag

            # Unescape quotes
            if (v[0] == '"' and v[-1] == '"'):
                v = v[1:-1]
                v = v.replace('\\"', '"')

            if (not v and errorOnEmptyTerm):
                diag = Diagnostic27()
                raise diag

            # Check for badly placed \s
            idx = v.find("\\")
            while (idx > -1):
                if idx + 1 >= len(v) or v[idx+1] not in ['?', '\\', '*', '^']:
                    diag = Diagnostic26()
                    diag.details = v
                    raise diag
                # Step over the escaped character so that the second half of
                # an escaped backslash is not read as a new escape.
                idx = v.find("\\", idx + 2)

        elif (errorOnEmptyTerm):
            diag = Diagnostic27()
            raise diag

        self.value = v

    def __str__(self):
        return self.value

    def toXCQL(self, depth=0):
        "Create XCQL representation of object"
        if (depth == 0):
            ns = ' xmlns="%s"' % (XCQLNamespace)
        else:
            ns = ""
        return "%s<term%s>%s</term>\n" % (" "*depth, ns, escape(self.value))


class Boolean(ModifiableObject):
    "Object to represent a CQL boolean"
    value = ""
    parent = None

    def __init__(self, bool, mods=None):
        "Create boolean `bool` with an optional list of modifier clauses"
        self.value = bool
        if mods is None:
            # Fresh list per instance rather than a shared default argument
            mods = []
        self.modifiers = mods
        self.parent = None

    def toXCQL(self, depth=0):
        "Create XCQL representation of object"
        space = " " * depth
        xml = ["%s<boolean>\n" % (space)]
        xml.append("%s <value>%s</value>\n" % (space, escape(self.value)))
        xml.extend(self._modifiersToXCQL(depth))
        xml.append("%s</boolean>\n" % (space))
        return ''.join(xml)

    def toCQL(self):
        "Return the boolean value followed by its '/'-separated modifiers"
        txt = [self.value]
        for m in self.modifiers:
            txt.append(m.toCQL())
        return '/'.join(txt)

    def resolvePrefix(self, name):
        "Delegate prefix resolution to the parent"
        return self.parent.resolvePrefix(name)


class ModifierType(PrefixedObject):
    # Same as index, but we'll XCQLify in ModifierClause
    parent = None
    prefix = "cql"


class ModifierClause:
    "Object to represent a relation modifier"
    parent = None
    type = None
    comparison = ""
    value = ""

    def __init__(self, type, comp="", val=""):
        "Create a modifier of `type` with optional comparison and value"
        # NOTE(review): this builds ModifierType directly rather than going
        # through modifierTypeType - confirm whether that is intended.
        self.type = ModifierType(type)
        self.type.parent = self
        self.comparison = comp
        self.value = val

    def __str__(self):
        if (self.value):
            return "%s%s%s" % (str(self.type), self.comparison, self.value)
        else:
            return "%s" % (str(self.type))

    def toXCQL(self, depth=0):
        "Create XCQL representation of object"
        outer = " " * depth
        if (self.value):
            inner = " " * (depth+1)
            return ("%s<modifier>\n"
                    "%s<type>%s</type>\n"
                    "%s<comparison>%s</comparison>\n"
                    "%s<value>%s</value>\n"
                    "%s</modifier>\n"
                    % (outer,
                       inner, escape(str(self.type)),
                       inner, escape(self.comparison),
                       inner, escape(self.value),
                       outer))
        else:
            return "%s<modifier><type>%s</type></modifier>\n" % (
                outer, escape(str(self.type)))

    def toCQL(self):
        return str(self)

    def resolvePrefix(self, name):
        # Need to skip parent, which has its own resolvePrefix
        # eg boolean or relation, neither of which is prefixable
        return self.parent.parent.resolvePrefix(name)


# Requires changes for:  <= >= <>, and escaped \" in "
# From shlex.py (std library for 2.2+)
class CQLshlex(shlex):
    "shlex with additions for CQL parsing"
    quotes = '"'
    commenters = ""
    # One token of look-ahead produced when an operator ends a word
    nextToken = ""

    def __init__(self, thing):
        shlex.__init__(self, thing)
        self.wordchars += "!@#$%^&*-+{}[];,.?|~`:\\"
        self.wordchars += ''.join(map(chr, range(128, 254)))

    def read_token(self):
        "Read a token from the input stream (no pushback or inclusions)"
        # self.state is ' ' between tokens, 'a' inside a word, '<' after a
        # '<' or '>', the quote character inside a quoted string, and None
        # once the end of input has been seen.

        while 1:
            if (self.nextToken != ""):
                self.token = self.nextToken
                self.nextToken = ""
                # Bah. SUPER ugly non portable
                if self.token == "/":
                    self.state = ' '
                    break

            nextchar = self.instream.read(1)
            if nextchar == '\n':
                self.lineno = self.lineno + 1
            if self.debug >= 3:
                print("shlex: in state  %r  I see character: %r"
                      % (self.state, nextchar))

            if self.state is None:
                self.token = ''        # past end of file
                break
            elif self.state == ' ':
                if not nextchar:
                    self.state = None  # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in whitespace state")
                    if self.token:
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self.instream.readline()
                    self.lineno = self.lineno + 1
                elif nextchar in self.wordchars:
                    self.token = nextchar
                    self.state = 'a'
                elif nextchar in self.quotes:
                    self.token = nextchar
                    self.state = nextchar
                elif nextchar in ['<', '>']:
                    self.token = nextchar
                    self.state = '<'
                else:
                    self.token = nextchar
                    if self.token:
                        break   # emit current token
                    else:
                        continue
            elif self.state == '<':
                # Only accumulate <=, >= or <>
                if self.token == ">" and nextchar == "=":
                    self.token = self.token + nextchar
                    self.state = ' '
                    break
                elif self.token == "<" and nextchar in ['>', '=']:
                    self.token = self.token + nextchar
                    self.state = ' '
                    break
                elif not nextchar:
                    self.state = None
                    break
                elif nextchar == "/":
                    self.state = "/"
                    self.nextToken = "/"
                    break
                elif nextchar in self.wordchars:
                    self.state = 'a'
                    self.nextToken = nextchar
                    break
                elif nextchar in self.quotes:
                    self.state = nextchar
                    self.nextToken = nextchar
                    break
                else:
                    self.state = ' '
                    break
            elif self.state in self.quotes:
                self.token = self.token + nextchar
                # Allow escaped quotes
                # NOTE(review): only the one preceding character is checked,
                # so an escaped backslash directly before the closing quote
                # looks like an escaped quote - confirm this is acceptable.
                if nextchar == self.state and self.token[-2] != '\\':
                    self.state = ' '
                    break
                elif not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in quotes state")
                    # Override SHLEX's ValueError to throw diagnostic
                    diag = Diagnostic14()
                    diag.details = self.token[:-1]
                    raise diag
            elif self.state == 'a':
                if not nextchar:
                    self.state = None   # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in word state")
                    self.state = ' '
                    if self.token:
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self.instream.readline()
                    self.lineno = self.lineno + 1
                elif nextchar in self.wordchars or nextchar in self.quotes:
                    self.token = self.token + nextchar
                elif nextchar in ['>', '<']:
                    self.nextToken = nextchar
                    self.state = '<'
                    break
                else:
                    # insert() works whether pushback is a list or a deque
                    self.pushback.insert(0, nextchar)
                    if self.debug >= 2:
                        print("shlex: I see punctuation in word state")
                    self.state = ' '
                    if self.token:
                        break   # emit current token
                    else:
                        continue
        result = self.token
        self.token = ''
        if self.debug > 1:
            if result:
                print("shlex: raw token=" + repr(result))
            else:
                print("shlex: raw token=EOF")
        return result


class CQLParser:
    "Token parser to create object structure for CQL"
    parser = ""
    currentToken = ""
    nextToken = ""

    def __init__(self, p):
        """ Initialise with shlex parser """
        self.parser = p
        self.fetch_token()   # Fetches to next
        self.fetch_token()   # Fetches to curr

    def is_boolean(self, token):
        "Is the token a boolean"
        token = token.lower()
        return token in booleans

    def fetch_token(self):
        """ Read ahead one token """
        tok = self.parser.get_token()
        self.currentToken = self.nextToken
        self.nextToken = tok

    def prefixes(self):
        """Consume leading '>name=uri' / '>uri' maps and return them as a dict.

        Names are stored lower-cased; the unnamed map is stored under "".
        Raises Diagnostic45 for a repeated name when errorOnDuplicatePrefix
        is set.
        """
        prefs = {}
        while (self.currentToken == ">"):
            # Strip off maps
            self.fetch_token()
            if self.nextToken == "=":
                # Named map
                name = self.currentToken
                self.fetch_token()  # = is current
                self.fetch_token()  # id is current
                identifier = self.currentToken
                self.fetch_token()
            else:
                name = ""
                identifier = self.currentToken
                self.fetch_token()
            # Compare the key that is actually stored, so that names which
            # differ only in case count as duplicates.
            key = name.lower()
            if (errorOnDuplicatePrefix and key in prefs):
                # Error condition
                diag = Diagnostic45()
                diag.details = name
                raise diag
            if (len(identifier) > 1 and identifier[0] == '"' and
                    identifier[-1] == '"'):
                identifier = identifier[1:-1]
            prefs[key] = identifier

        return prefs

    def query(self):
        """ Parse query """
        prefs = self.prefixes()
        left = self.subQuery()
        # Fold 'left BOOLEAN right' into left-nested triples for as long as
        # the current token is a boolean.
        while self.currentToken and self.is_boolean(self.currentToken):
            boolobject = self.boolean()
            right = self.subQuery()
            # Setup Left Object
            trip = tripleType()
            trip.leftOperand = left
            trip.boolean = boolobject
            trip.rightOperand = right
            left.parent = trip
            right.parent = trip
            boolobject.parent = trip
            left = trip

        for p in prefs.keys():
            left.addPrefix(p, prefs[p])
        return left

    def subQuery(self):
        """ Find either query or clause """
        if self.currentToken == "(":
            self.fetch_token()  # Skip (
            object = self.query()
            if self.currentToken == ")":
                self.fetch_token()  # Skip )
            else:
                diag = Diagnostic13()
                diag.details = self.currentToken
                raise diag
        else:
            prefs = self.prefixes()
            if (prefs):
                object = self.query()
                for p in prefs.keys():
                    object.addPrefix(p, prefs[p])
            else:
                object = self.clause()
        return object

    def clause(self):
        """ Find searchClause """
        nextIsBoolean = self.is_boolean(self.nextToken)
        if not nextIsBoolean and not (self.nextToken in [')', '(', '']):
            # index relation term
            index = indexType(self.currentToken)
            self.fetch_token()   # Skip Index
            rel = self.relation()
            if (self.currentToken == ''):
                diag = Diagnostic10()
                diag.details = "Expected Term, got end of query."
                raise diag
            term = termType(self.currentToken)
            self.fetch_token()   # Skip Term

            irt = searchClauseType(index, rel, term)

        elif self.currentToken and (nextIsBoolean or
                                    self.nextToken in [')', '']):
            # Bare term: use the server choice index and relation
            irt = searchClauseType(indexType(serverChoiceIndex),
                                   relationType(serverChoiceRelation),
                                   termType(self.currentToken))
            self.fetch_token()

        elif self.currentToken == ">":
            prefs = self.prefixes()
            # iterate to get object
            object = self.clause()
            for p in prefs.keys():
                object.addPrefix(p, prefs[p])
            return object

        else:
            diag = Diagnostic10()
            diag.details = ("Expected Boolean or Relation but got: " +
                            self.currentToken)
            raise diag

        return irt

    def modifiers(self):
        """Consume '/modifier[comparison value]' sequences; return the clauses.

        Raises Diagnostic20 when a separator is directly followed by another
        separator.
        """
        mods = []
        while (self.currentToken == modifierSeparator):
            self.fetch_token()
            mod = self.currentToken
            mod = mod.lower()
            if (mod == modifierSeparator):
                diag = Diagnostic20()
                diag.details = "Null modifier"
                raise diag
            self.fetch_token()
            comp = self.currentToken
            if (comp in order):
                self.fetch_token()
                value = self.currentToken
                self.fetch_token()
            else:
                comp = ""
                value = ""
            # NOTE(review): this instantiates ModifierClause directly rather
            # than modifierClauseType - confirm whether that is intended.
            mods.append(ModifierClause(mod, comp, value))
        return mods

    def boolean(self):
        """ Find boolean """
        self.currentToken = self.currentToken.lower()
        if self.currentToken in booleans:
            bool = booleanType(self.currentToken)
            self.fetch_token()
            bool.modifiers = self.modifiers()
            for b in bool.modifiers:
                b.parent = bool
        else:
            diag = Diagnostic37()
            diag.details = self.currentToken
            raise diag
        return bool

    def relation(self):
        """ Find relation """
        self.currentToken = self.currentToken.lower()
        rel = relationType(self.currentToken)
        self.fetch_token()
        rel.modifiers = self.modifiers()
        for r in rel.modifiers:
            r.parent = rel
        return rel


class XCQLParser:
    """ Parser for XCQL using some very simple DOM """

    def _elements(self, elem):
        "Return the child nodes of elem that are elements"
        return [c for c in elem.childNodes
                if c.nodeType == Node.ELEMENT_NODE]

    def firstChildElement(self, elem):
        """ Find first child which is an Element """
        for c in elem.childNodes:
            if c.nodeType == Node.ELEMENT_NODE:
                return c
        return None

    def firstChildData(self, elem):
        """ Find first child which is Data """
        for c in elem.childNodes:
            if c.nodeType == Node.TEXT_NODE:
                return c
        return None

    def searchClause(self, elem):
        """Process a <searchClause> element and return a search clause.

        Raises ValueError for an unexpected child element, or when index,
        relation or term is missing.
        """
        index = None
        relation = None
        term = None
        prefs = None
        for c in self._elements(elem):
            if c.localName == "index":
                index = indexType(self.firstChildData(c).data.lower())
            elif c.localName == "term":
                term = termType(self.firstChildData(c).data)
            elif c.localName == "relation":
                relation = self.relation(c)
            elif c.localName == "prefixes":
                prefs = self.prefixes(c)
            else:
                raise ValueError(c.localName)
        if index is None or relation is None or term is None:
            raise ValueError("searchClause requires index, relation and term")
        # Build the clause the same way CQLParser.clause does, so that the
        # parts are linked to their parent.
        sc = searchClauseType(index, relation, term)
        if prefs is not None:
            sc.prefixes = prefs
        return sc

    def _operand(self, elem):
        "Process the single child of a <leftOperand> or <rightOperand>"
        c2 = self.firstChildElement(elem)
        if c2.localName == "searchClause":
            return self.searchClause(c2)
        else:
            return self.triple(c2)

    def triple(self, elem):
        """Process a <triple> element and return a triple.

        Any child element other than boolean, prefixes and leftOperand is
        treated as the right operand.
        """
        trip = tripleType()
        for c in self._elements(elem):
            if c.localName == "boolean":
                trip.boolean = self.boolean(c)
                trip.boolean.parent = trip
            elif c.localName == "prefixes":
                trip.prefixes = self.prefixes(c)
            elif c.localName == "leftOperand":
                trip.leftOperand = self._operand(c)
                trip.leftOperand.parent = trip
            else:
                trip.rightOperand = self._operand(c)
                trip.rightOperand.parent = trip
        return trip

    def relation(self, elem):
        "Process a <relation> element and return a relation"
        value = ""
        mods = None
        for c in self._elements(elem):
            if c.localName == "value":
                value = c.firstChild.data.lower()
            elif c.localName == "modifiers":
                # NOTE(review): modifiers are collected as plain strings,
                # whereas Relation.toXCQL calls toXCQL() on each modifier -
                # confirm the intended representation.
                mods = []
                for c2 in self._elements(c):
                    if c2.localName == "modifier":
                        for c3 in self._elements(c2):
                            if c3.localName == "value":
                                val = self.firstChildData(c3).data.lower()
                                mods.append(val)
        rel = relationType(value)
        if mods is not None:
            rel.modifiers = mods
        return rel

    def boolean(self, elem):
        "Process a <boolean> element and return a boolean"
        value = ""
        modlist = None
        for c in self._elements(elem):
            if c.localName == "value":
                value = self.firstChildData(c).data.lower()
            else:
                # Can be in any order, so we need to extract, then order
                mods = {}
                for c2 in self._elements(c):
                    if c2.localName == "modifier":
                        type = ""
                        modvalue = ""
                        for c3 in self._elements(c2):
                            if c3.localName == "value":
                                modvalue = self.firstChildData(c3).data.lower()
                            elif c3.localName == "type":
                                type = self.firstChildData(c3).data
                        mods[type] = modvalue

                # NOTE(review): booleanModifierTypes is not defined in this
                # module; presumably it is expected from one of the star
                # imports - confirm, otherwise this raises NameError.  The
                # resulting list holds plain strings, whereas Boolean.toCQL
                # calls toCQL() on each modifier.
                modlist = []
                for t in booleanModifierTypes[1:]:
                    if t in mods:
                        modlist.append(mods[t])
                    else:
                        modlist.append('')
        bool = booleanType(value)
        if modlist is not None:
            bool.modifiers = modlist
        return bool

    def prefixes(self, elem):
        "Process a <prefixes> element; return {lower-cased name: identifier}"
        prefs = {}
        for c in self._elements(elem):
            # prefix
            name = ""
            identifier = ""
            for c2 in self._elements(c):
                if c2.localName == "name":
                    name = self.firstChildData(c2).data.lower()
                elif c2.localName == "identifier":
                    identifier = self.firstChildData(c2).data
            prefs[name] = identifier
        return prefs


def xmlparse(s):
    """ API. Return a searchClause/triple object from XML string """
    doc = parseString(s)
    # documentElement is the root element even when the document starts
    # with a comment or a doctype declaration.
    q = xcqlparse(doc.documentElement)
    return q


def xcqlparse(query):
    """ API.  Return a searchClause/triple object from XML DOM objects"""
    # Requires only properties of objects so we don't care how they're generated

    p = XCQLParser()
    if query.localName == "searchClause":
        return p.searchClause(query)
    else:
        return p.triple(query)


def parse(query):
    """ API. Return a searchClause/triple object from CQL string

    Raises Diagnostic10 when the query cannot be encoded as UTF-8 or when
    tokens are left over after parsing; the lexer and the object
    constructors raise further diagnostics for malformed input.
    """

    try:
        encoded = query.encode("utf-8")
    except (UnicodeError, AttributeError):
        diag = Diagnostic10()
        diag.details = "Cannot parse non utf-8 characters"
        raise diag
    if isinstance(encoded, str):
        # Byte strings are the native str type (Python 2): lex the UTF-8
        # bytes.  Otherwise keep the text as it is, since the text StringIO
        # does not accept bytes.
        query = encoded

    q = StringIO(query)
    lexer = CQLshlex(q)
    parser = CQLParser(lexer)
    object = parser.query()
    if parser.currentToken != '':
        diag = Diagnostic10()
        diag.details = ("Unprocessed tokens remain: " +
                        repr(parser.currentToken))
        raise diag
    return object


# Assign our objects to generate
tripleType = Triple
booleanType = Boolean
relationType = Relation
searchClauseType = SearchClause
modifierClauseType = ModifierClause
modifierTypeType = ModifierType
indexType = Index
termType = Term

try:
    from CQLUtils import *
    _cqlUtilsTypes = (CTriple, CBoolean, CRelation, CSearchClause,
                      CModifierClause, CModifierType, CIndex, CTerm)
except Exception:
    # Nested scopes. Utils needs our classes to parent
    # We need its classes to build (maybe)
    # Deliberately best-effort: when CQLUtils is missing, fails to import or
    # does not (yet) provide every class, the defaults above stay in force.
    pass
else:
    # Switch over only when the complete set is available
    (tripleType, booleanType, relationType, searchClauseType,
     modifierClauseType, modifierTypeType, indexType,
     termType) = _cqlUtilsTypes


if (__name__ == "__main__"):
    import sys
    s = sys.stdin.readline()
    try:
        q = parse(s)
    except SRWDiagnostic:
        # sys.exc_info() fetches the exception on every Python version
        diag = sys.exc_info()[1]
        # Print a full version, not just str()
        print("Diagnostic Generated.")
        print(" Code: " + str(diag.code))
        print(" Details: " + str(diag.details))
        print(" Message: " + str(diag.message))
    else:
        print(q.toXCQL()[:-1])