#!/usr/bin/env python

"""Parses MARC-format data.  The MARC class has a constructor
which takes binary MARC data.
"""

# This file should be available from
# http://www.pobox.com/~asl2/software/PyZ3950/
# and is licensed under the X Consortium license:
# Copyright (c) 2001, Aaron S. Lav, asl2@pobox.com
# All rights reserved.

# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, and/or sell copies of the Software, and to permit persons
# to whom the Software is furnished to do so, provided that the above
# copyright notice(s) and this permission notice appear in all copies of
# the Software and that both the above copyright notice(s) and this
# permission notice appear in supporting documentation.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
# OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
# INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
# FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
# WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# Except as contained in this notice, the name of a copyright holder
# shall not be used in advertising or otherwise to promote the sale, use
# or other dealings in this Software without prior written authorization
# of the copyright holder.
import sys import string from xml.sax.saxutils import escape class MarcError (Exception): pass def is_fixed (num): return num < 10 fieldsep = '\x1e' sep = '\x1f' # XXX or 1D for pseudo-marc output from z3950.c recsep = '\x1d' # Attributes for SGML DTD (!!!) If not present, then I1 I2 attrHash = { 22 : ['ISDSLvl', 'I2'], 24 : ['StdNum', 'DiffInd'], 28 : ['PubNmTyp', 'NteAdEnty'], 33 : ['DateType', 'EventTyp'], 34 : ['ScapeTyp', 'I2'], 41 : ['TransInd', 'I2'], 45 : ['TimePrd', 'I2'], 50 : ['InLofC', 'CNSrc'], 55 : ['InNLC', 'CNCLSSrc'], 60 : ['InNLM', 'CNSrc'], 70 : ['InNAL', 'I2'], 72 : ['I1', 'CodeSrc'], 82 : ['Edition', 'CNSrc'], 86 : ['NumbrSrc', 'I2'], 100 : ['NameType', 'I2'], 110: ['NameType', 'I2'], 111 : ['NameType', 'I2'], 130: ['NFChars', 'I2'], 150 : ['I1', 'NFChars'], 151: ['I1', 'NFChars'], 210 : ['AddEnty', 'I2'], 211: ['AddEnty', 'NFChars'], 212 : ['AddEnty', 'I2'], 214: ['AddEnty', 'NFChars'], 222 : ['I1', 'NFChars'], 240: ['PrntDisp', 'NFChars'], 242 : ['AddEnty', 'NFChars'], 243: ['PrntDisp', 'NFChars'], 245 : ['AddEnty', 'NFChars'], 246: ['NCAddEty', 'TitleTyp'],247 : ['AddEnty', 'NoteCntl'], 270: ['Level', 'AddrType'], 355 : ['CntlElmt', 'I2'], 362: ['DTFormat', 'I2'], 400 : ['NameType', 'Pronoun'], 410: ['NameType', 'Pronoun'], 411 : ['NameType', 'Pronoun'], 430: ['I1', 'NFChars'], 440 : ['I1', 'NFChars'], 450: ['I1', 'NFChars'], 451 : ['I1', 'NFChars'], 490: ['Traced', 'I2'], 505 : ['DCC', 'CDLevel'], 510: ['CoverLoc', 'I2'], 511 : ['DCC', 'I2'], 516: ['DCC', 'I2'], 521 : ['DCC', 'I2'], 520: ['DCC', 'I2'], 522 : ['DCC', 'I2'], 524: ['DCC', 'I2'], 535 : ['Holds', 'I2'], 537: ['DCC', 'I2'], 551 : ['I1', 'NFChars'], 555: ['DCC', 'I2'], 556 : ['DCC', 'I2'], 565: ['DCC', 'I2'], 567 : ['DCC', 'I2'], 581: ['DCC', 'I2'], 582 : ['DCC', 'I2'], 586: ['DCC', 'I2'], 600 : ['NameType', 'SubjSys'], 610: ['NameType', 'SubjSys'], 611 : ['NameType', 'SubjSys'], 630: ['NFChars', 'SubjSys'], 650 : ['SubjLvl', 'SubjSys'], 651: ['I1', 'SubjSys'], 653 : ['IndexLvl', 
'I2'], 654: ['IndexLvl', 'I2'], 655 : ['Type', 'Source'], 656: ['I1', 'Source'], 656 : ['I1', 'Source'], 700: ['NameType','EntryType'],710 : ['NameType','EntryType'], 711: ['NameType','EntryType'],730 : ['NFChars','EntryType'], 740: ['NFChars','EntryType'], 760 : ['NoteCntl', 'I2'], 762: ['NoteCntl', 'I2'], 765 : ['NoteCntl', 'I2'], 767: ['NoteCntl', 'I2'], 772 : ['NoteCntl', 'I2'], 773: ['NoteCntl', 'I2'], 775 : ['NoteCntl', 'I2'], 776: ['NoteCntl', 'I2'], 777 : ['NoteCntl', 'I2'], 780: ['NoteCntl', 'RelType'], 785 : ['NoteCntl', 'RelType'], 787: ['NoteCntl', 'I2'], 800 : ['NameType', 'I2'], 810: ['NameType', 'I2'], 811 : ['NameType', 'I2'], 830: ['I1', 'NFChars'], 852 : ['Scheme', 'Order'], 853: ['CmprsExpnd', 'Eval'], 853 : ['CmprsExpnd', 'Eval'], 856: ['AccsMeth', 'I2'], 863 : ['EncLevel', 'HoldForm'], 864: ['EncLevel','HoldForm'], 865 : ['EncLevel', 'HoldForm'], 866: ['EncLevel','Notation'], 867 : ['EncLevel', 'Notation'], 868: ['EncLevel','Notation'], 886 : ['FldType', 'I2']} subfieldHash = {'1' : "one", '2' : "two", '3' : "three", '4' : "four", '5' : "five", '6' : "six", '7' : "seven", '8' : "eight", '9' : "nine", '0' : "zero"} # takes text, turns it into tuple of (ind1, ind2, list of (subfield, val)) # where subfield may repeat within the list. # We need a structure like this in order to correctly parse both records: # 650 0 $aWorld War, 1939-1945$xCampaigns$zTunisia # 650 0 $aReal property$zMississippi$zTippah County$xMaps # (taken from _USMARC Format for Bibliographic Data_, Prepared by Network # Development and MARC Standards Office, Cataloging Distribution Service, # Library of Congress, section 650 p. 5, page printed Dec 1991, looseleaf # binder issued in 1988. def parse_sub (field): if len (field) < 4: if field == ' ': # Is this legit? I've seen it, so handle correctly. 
# specifically for au=Johansen, Arnold S from z3950.bibsys.no:2100 return (' ', ' ', []) return None if field [2] <> sep: print "Bad field [2]", repr (field[2]) return None ind1 = field[0] ind2 = field[1] sublist = [] splitlist = string.split (field[2:], sep) for sub in splitlist: if (sub == ''): # we begin w/ sep, so there's an empty prefix continue sublist.append ((sub[0], string.strip(sub[1:]))) return (ind1, ind2, sublist) class MARC: """Parses data into 'fields' attribute, indexed by field number. Each value is a list. For fixed fields, it's a list of the string data (one string for each occurence of the field in the original data). For other fields, each list element is a tuple of (indicator 1, indicator 2, subdata), where subdata is a list of tuples of (subfield indicator, subfield data). Yes, this is kinda lame and I really should have used structures, but this was some of the first Python code I ever wrote. """ hdrbits = [5,6,7,8,17,18,19] # Status, Type, Bib. Level, Type of Ctrl., Enc. Level, # Descr. Cat. Form, Linked Rcd Reqt are all part of pseudoentry 0 def __init__(self, MARC = None, strict = 1): """Parses MARC data. 
According to Bill Oldroyd (Bill.Oldroyd at bl.uk), some servers don't set the character set and/or other bits of the MARC header properly, so it's useful to set strict=0 when dealing with such servers.""" self.fields = {} self.ok = 0 self.marc = MARC if MARC == None: return # we'll write to it later reclen = self.extract_int (0,4) self.reclen = reclen baseaddr = self.extract_int (12, 16) zerostr = "" for ind in self.hdrbits: zerostr = zerostr + self.marc[ind] self.fields [0] = [zerostr] if strict: assert (self.marc[9] == ' ') # 'a' would be UCS/Unicode assert (self.marc[10] == '2' and self.marc[11] == '2') assert (self.marc[20:22] == '45') pos = 24 lastpos = baseaddr while pos < baseaddr: tag = self.marc[pos:pos+3] if tag [0] == '\035' or tag [0] == '\036': break fieldlen = self.extract_int (pos + 3, pos + 6) startpos = self.extract_int (pos + 7, pos + 11) pos = pos + 12 start = baseaddr + startpos end = start + fieldlen line = self.marc[start:end] lastpos = startpos if line [-1] == '\x1E': line = line[:-1] else: print "Weird, no hex 1E for", tag, repr(line) field = string.atoi (tag) if is_fixed (field): self.fields[field] = [line] # 1-elt list for orthogonality of processing else: ps = parse_sub (line) if ps == None: raise MarcError (line) self.fields.setdefault (field, []).append (ps) self.ok = 1 # XXX should do more error-checking def __str__ (self): k = self.fields.keys () k.sort () lst = [] for field in k: lst.append (self.stringify_field (field)) return "MARC: \n" + "\n".join (lst) def stringify_field (self, k): f = self.fields [k] if is_fixed (k): return str (k) + " " + f[0] else: str_l = [] for l in f: def fmt (x): return '$%s%s' % (x[0], x[1]) sl = map (fmt, l[2]) str_l.append (str(k) + " " + l[0] + l[1] + " ".join (sl)) return "\n".join (str_l) def extract_int (self, start, end): return string.atoi (self.marc[start:end+1]) def get_MARC (self): hdrlist = [' '] * 24 zerostr = self.fields [0][0] for i in range (len (zerostr)): hdrlist [self.hdrbits [i]] = 
zerostr [i] hdrlist [10] = '2' # replace these with data map, assert on read hdrlist [11] = '2' hdrlist [20] = '4' hdrlist [21] = '5' hdrlist [22] = '0' hdrlist [23] = '0' # later - 0-4 log. record length, 12-16 base addr of data # directory: 3 of tag, 4 of field len, 5 of starting pos (rel. # to base address of data, 12-16 fields = self.fields.keys () data = '' directory = '' for field in fields: if field == 0: # pseudofield continue for fielddat in self.fields [field]: start = len (data) if is_fixed (field): data += fielddat else: sublist = (fielddat [0] + fielddat [1] + "".join (map (lambda s: sep + s[0] + s[1], fielddat[2]))) data += sublist data += fieldsep # XXX is this right? length = len (data) - start directory += "%.03d%.04d%.05d" % (field, length, start) def id (x): return x data += fieldsep + recsep hdrlist [0:5] = map (id, "%.05d" % (len (hdrlist) + len (directory) + len (data),)) hdrlist [12:17] = map (id,"%.05d" % (len (hdrlist) + len (directory),)) return "".join (hdrlist) + directory + data def toMARCXML(self): " Convert record to MarcXML Schema " keys = self.fields.keys() keys.sort() xmllist = ["\n", " %s\n" % (self.get_MARC()[:24])] for key in keys: if key == 0: # XXX Skip?? What are these?? pass elif key < 10: xmllist.append(" %s\n" % (key, self.fields[key][0])) else: for instance in self.fields[key]: if key < 100: keystr = "0" + str(key) else: keystr = str(key) xmllist.append(" \n" % (keystr, instance[0], instance[1])) for sub in instance[2]: xmllist.append(" %s\n" % (sub[0], escape(sub[1]))) xmllist.append(" \n") xmllist.append("") xml = ''.join(xmllist) return xml def toOAIMARC(self): """Convert record to OAI MARC XML Schema. Note Well that OAI-MHP 2.0 recommends using MarcXML""" keys = self.fields.keys() keys.sort() marc = self.get_MARC() # What should these attributes really be? xmllist = ['\n' % (marc[6], marc[7])] for key in keys: if key == 0: # Skip?? What are these? 
pass elif key < 10: xmllist.append(" %s\n" % (key, self.fields[key][0])) else: for instance in self.fields[key]: xmllist.append(" \n" % (key, instance[0], instance[1])) for sub in instance[2]: xmllist.append(" %s\n" % (sub[0], escape(sub[1]))) xmllist.append(" \n") xmllist.append("") xml = ''.join(xmllist) return xml def sgml_processCode(self, k): if attrHash.has_key(k): i1 = attrHash[k][0] i2 = attrHash[k][1] else: i1 = "I1" i2 = "I2" if k < 100: keystr = "0%d" % (k) else: keystr = str(k) sgmllist = [] for instance in self.fields[k]: sgmllist.append(' \n' % (keystr, i1, instance[0], i2, instance[1])) for sub in instance[2]: stag = sub[0] if subfieldHash.has_key(stag): stag = subfieldHash[stag] sgmllist.append(' <%s>%s\n' % (stag, escape(sub[1]), stag)) sgmllist.append(' \n' % (keystr)) sgml = ''.join(sgmllist) return sgml def toSGML(self): """ Convert record to USMARC SGML """ keys = self.fields.keys() keys.sort() # Extract field ranges cflds = [] numbcode = [] mainenty = [] titles = [] edimprnt = [] physdesc = [] series = [] notes = [] subjaccs = [] addenty = [] linkenty = [] saddenty = [] holdaltg = [] fld9xx = [] # Ugly for k in keys: if k == 0: pass elif k < 10: cflds.append(k) elif k < 100: numbcode.append(k) elif k < 200: mainenty.append(k) elif k < 250: titles.append(k) elif k < 300: edimprnt.append(k) elif k < 400: physdesc.append(k) elif k < 500: series.append(k) elif k < 600: notes.append(k) elif k < 700: subjaccs.append(k) elif k < 760: addenty.append(k) elif k < 800: linkenty.append(k) elif k < 840: saddenty.append(k) elif k < 900: holdaltg.append(k) else: fld9xx.append(k) marc = self.get_MARC() sgml = ["\n"] sgml.append(" \n") sgml.append(" %s\n" % (marc[:5])) sgml.append(" %s\n" % (marc[5])) sgml.append(" %s\n" % (marc[6])) sgml.append(" %s\n" % (marc[7])) sgml.append(" %s\n" % (marc[8:10])) sgml.append(" %s\n" % (marc[10])) sgml.append(" %s\n" % (marc[11])) sgml.append(" %s\n" % (marc[12:17])) sgml.append(" %s\n" % (marc[17])) sgml.append(" %s\n" % 
(marc[18])) sgml.append(" %s\n" % (marc[19])) sgml.append(" \n") sgml.append(" %s\n" % (marc[20])) sgml.append(" %s\n" % (marc[21])) sgml.append(" %s\n" % (marc[22])) sgml.append(" %s\n" % (marc[23])) sgml.append(" \n") sgml.append(" \n") sgml.append(" \n") sgml.append(" \n") sgml.append(" \n") for k in cflds: sgml.append(" %s\n" % (k, self.fields[k][0], k)) sgml.append(" \n") sgml.append(" \n") sgml.append(" \n") for k in numbcode: sgml.append(self.sgml_processCode(k)) sgml.append(" \n") if mainenty: sgml.append(" \n") for k in mainenty: sgml.append(self.sgml_processCode(k)) sgml.append(" \n") if titles: sgml.append(" \n") for k in titles: sgml.append(self.sgml_processCode(k)) sgml.append(" \n") if edimprnt: sgml.append(" \n") for k in edimprnt: sgml.append(self.sgml_processCode(k)) sgml.append(" \n") if physdesc: sgml.append(" \n") for k in physdesc: sgml.append(self.sgml_processCode(k)) sgml.append(" \n") if series: sgml.append(" \n") for k in series: sgml.append(self.sgml_processCode(k)) sgml.append(" \n") if notes: sgml.append(" \n") for k in notes: sgml.append(self.sgml_processCode(k)) sgml.append(" \n") if subjaccs: sgml.append(" \n") for k in subjaccs: sgml.append(self.sgml_processCode(k)) sgml.append(" \n") if addenty: sgml.append(" \n") for k in addenty: sgml.append(self.sgml_processCode(k)) sgml.append(" \n") if linkenty: sgml.append(" \n") for k in linkenty: sgml.append(self.sgml_processCode(k)) sgml.append(" \n") if saddenty: sgml.append(" \n") for k in saddenty: sgml.append(self.sgml_processCode(k)) sgml.append(" \n") if holdaltg: sgml.append(" \n") for k in holdaltg: sgml.append(self.sgml_processCode(k)) sgml.append(" \n") if fld9xx: sgml.append(" \n") for k in fld9xx: sgml.append(self.sgml_processCode(k)) sgml.append(" \n") sgml.append(" \n") sgml.append(" \n") sgml.append("") return ''.join(sgml) def toSimpleDC(self): """ Convert Marc into DC according to LC Crosswalk """ xml = ['\n'] # Title -> 245 if self.fields.has_key(245): instance = 
self.fields[245][0][2] a = '' b = '' for sub in instance: if sub[0] == 'a': a = sub[1] elif sub[0] == 'b': b = sub[1] if a and b and a[-1] in [',', '.', ';', ':']: a += " " + b elif a and b: a += "; " + b elif b and not a: a = b xml.append(" %s\n" % (a)) # Creator -> 100,110,111,700,710,711 authorKeys = [100, 110, 111, 700, 710, 711] for k in authorKeys: if self.fields.has_key(k): for instance in self.fields[k]: a = '' h = '' d = '' for sub in instance[2]: if sub[0] == 'a': a = sub[1] elif sub[0] == 'h': h = sub[1] elif sub[0] == 'd': d = sub[1] if h: a += ", " + h if d: a += " (" + d + ")" xml.append(" %s\n" % (a)) # Subject -> 600,610, 611, 630, 650, 653 # Just dump in directly... subjectList = [600, 610, 611, 630, 650, 653] for s in subjectList: if self.fields.has_key(s): for instance in self.fields[s]: subject = '' for sub in instance[2]: subject += sub[1] + " -- " subject = subject[:-4] xml.append(" %s\n" % (subject)) # Publisher -> 260$a$b if self.fields.has_key(260): for instance in self.fields[260]: a = b = '' for sub in instance[2]: if sub[0] == 'a': a = sub[1] elif sub[0] == 'b': b = sub[1] if b[-1] in [',', ';', ':']: b = b[:-1] elif sub[0] == 'c': d = sub[1] if d[-1] == '.': d = d[:-1] xml.append(" %s\n" % (d)) if b: a += " " + b if a: xml.append(" %s\n" % (a)) # Type -> 655 if self.fields.has_key(655): for instance in self.fields[655]: gf = '' for sub in instance[2]: gf += sub[1] + " -- " gf = gf[:-4] xml.append(" %s\n" % (gf)) # Non Standard: Identifier -> ISSN/ISBN for k in [20,22]: if self.fields.has_key(k): for instance in self.fields[k]: for sub in instance[2]: if sub[0] == 'a': xml.append(" %s\n" % (sub[1])) # Non Standard: Description -> 300 if self.fields.has_key(300): for instance in self.fields[300]: desc = '' for sub in instance[2]: desc += sub[1] + " " desc = desc[:-1] xml.append(" %s\n" % (desc)) xml.append("") return ''.join(xml) def toMODS(self): """ Tranform MARC record into MODS according to CrossWalk """ xml = ["\n"] # --- TitleInfo 
Fields --- if self.fields.has_key(245): instance = self.fields[245][0][2] xml.append(" \n ") insubtitle = 0 for sub in instance: if (sub[0] in ['a', 'f', 'g', 'k']): xml.append(escape(sub[1])) xml.append(' ') elif (sub[0] == 'b'): xml.append("\n %s " % (escape(sub[1]))) insubtitle = 1 if (insubtitle): xml.append("\n \n") else: xml.append("\n \n") if self.fields.has_key(210): instance = self.fields[210][0][2] subf = {} for sub in instance: subf[sub[0]] = escape(sub[1]) xml.append(' \n %s\n' % (subf['a'])) if (subf.has_key('b')): xml.append(' %s\n' % (subf['b'])) xml.append(' \n') if self.fields.has_key(242): instance = self.fields[242][0][2] subf = {} for sub in instance: subf[sub[0]] = escape(sub[1]) if (subf.has_key('i')): label = ' displayLabel="%s"' % (subf['i']) else: label = '' xml.append(' \n %s\n' % (label, subf['a'])) if (subf.has_key('b')): xml.append(' %s\n' % (subf['b'])) if (subf.has_key('n')): xml.append(' %s\n' % (subf['n'])) if (subf.has_key('p')): xml.append(' %s\n' % (subf['p'])) xml.append(' \n') if self.fields.has_key(246): full = self.fields[246][0] subfield2 = full[1] instance = full[2] subf = {} for sub in instance: subf[sub[0]] = escape(sub[1]) if (subfield2 == 1): xml.append(' \n %s\n' % (subf['a'])) else: xml.append(' \n %s\n' % (subf['a'])) if (subf.has_key('b')): xml.append(' %s\n' % (subf['b'])) if (subf.has_key('n')): xml.append(' %s\n' % (subf['n'])) if (subf.has_key('p')): xml.append(' %s\n' % (subf['p'])) xml.append(' \n') if self.fields.has_key(130): uniform = self.fields[130][0][2] elif self.fields.has_key(240): uniform = self.fields[240][0][2] else: uniform = [] if (uniform): subf = {} for sub in uniform: subf[sub[0]] = escape(sub[1]) xml.append(' \n %s\n' % (subf['a'])) if (subf.has_key('n')): xml.append(' %s\n' % (subf['n'])) if (subf.has_key('p')): xml.append(' %s\n' % (subf['p'])) xml.append(' \n') # --- Name Fields --- # Creator -> 100,110,111, 700,710,711 authorKeyTypes = {100 : 'personal', 110 : 'corporate', 111 : 
'conference', 700 : 'personal', 710 : 'corporate', 711 : 'conference'} for k in authorKeyTypes.keys(): if self.fields.has_key(k): for instance in self.fields[k]: subf = {} for sub in instance[2]: subf[sub[0]] = escape(sub[1]) xml.append(' \n' % (k)) xml.append(' \n' % (authorKeyTypes[k])) xml.append(' creator\n') xml.append(' %s\n' % (subf['a'])) if (subf.has_key('d')): xml.append(' %s\n' % (subf['d'])) if (subf.has_key('b')): if (k in [100,700]): xml.append(' %s\n' % (subf['b'])) else: xml.append(' %s\n' % (subf['b'])) if (subf.has_key('e')): xml.append(' %s\n' % (subf['e'])) if (subf.has_key('4')): xml.append(' %s\n' % (subf['4'])) xml.append(' \n') ldr = self.fields[0][0] type = ldr[1] types = {'a' : 'text', 't' : 'text', 'e' : 'cartographic', 'f' : 'cartographic', 'c' : 'notated music', 'd' : 'notated music', 'i' : 'sound recording - nonmusical', 'j' : 'sound recording - musical', 'k' : 'still image', 'g' : 'moving image', 'r' : 'three dimensional object', 'm' : 'software, multimedia', 'p' : 'mixed material'} if (types.has_key(type)): xml.append(' %s\n' % (types[type])) if (self.fields.has_key(8)): instance = self.fields[8][0] # XXX LONG set of checks for type and various 008 positions :( if (len(instance) > 33 and instance[33] == '0'): xml.append(' non fiction\n') if self.fields.has_key(655): for instance in self.fields[655]: gf = '' for sub in instance[2]: gf += escape(sub[1]) + " -- " gf = gf[:-4] xml.append(" %s\n" % (gf)) # PublicationInfo from 260 f260 = self.fields.get(260, []) f44 = self.fields.get(44, []) f46 = self.fields.get(46, []) f250 = self.fields.get(250, []) f310 = self.fields.get(310, []) f321 = self.fields.get(321, []) f8 = self.fields.get(8, []) if f260 or f46 or f250 or f310 or f321: xml.append(' \n') if (f8 and len(f8[0]) > 18 ): loc = f8[0][15:18] if (loc <> ' ' and loc <> '|||'): xml.append(' %s\n' % (loc)) if (f44): for s in f44[0][2]: if (s[0] == 'c'): xml.append(' %s\n' % (escape(s[1]))) if (f260): instance = self.fields[260][0][2] 
subf260 = {} for sub in instance: subf260[sub[0]] = escape(sub[1]) if (subf260.has_key('a')): xml.append(' %s\n' % (subf260['a'])) if (subf260.has_key('b')): xml.append(' %s\n' % (subf260['b'])) if (subf260.has_key('c')): xml.append(' %s\n' % (subf260['c'])) if (f8 and len(f8[0]) > 6): f8type = f8[0][6] if (f8type in ['e', 'p', 'r', 's', 't']): date = f8[0][7:11] if (date <> ' '): xml.append(' %s\n' % (date)) if (f8type in ['c', 'd', 'i', 'k', 'm', 'u', 'q']): if (f8type == 'q'): attrib = ' qualifier="questionable"' else: attrib = "" start = f8[0][7:11] if (start <> ' '): xml.append(' %s\n' % (attrib, start)) end = f8[0][11:15] if (end <> ' '): xml.append(' %s\n' % (attrib, end)) if (f260): if subf260.has_key('g'): xml.append(' %s\n' % (escape(subf260['g']))) if (f46): instance = f46[0][2] subf46 = {} for s in instance: subf46[s[0]] = escape(s[1]) if (subf46.has_key('k')): xml.append(' %s\n' % (subf46['k'])) if (subf46.has_key('l')): xml.append(' %s\n' % (subf46['l'])) if (subf46.has_key('m')): xml.append(' %s\n' % (subf46['m'])) if (subf46.has_key('n')): xml.append(' %s\n' % (subf46['n'])) if (subf46.has_key('j')): xml.append(' %s\n' % (subf46['j'])) if (f250): for s in f250[0][2]: if (s[0] == 'a'): xml.append(' %s\n' % (escape(s[1]))) break if (self.fields.has_key(0) and len(self.fields[0][0]) > 2): f0type = self.fields[0][0][2] if (f0type in ['b', 'i', 's']): xml.append(' continuing\n') elif (f0type in ['a', 'c', 'd', 'm']): xml.append(' monographic\n') if (f310): subf310 = {'a' : '', 'b' : ''} for s in f310[0][2]: subf310[s[0]] = escape(s[1]) xml.append(' %s %s\n' % (subf310['a'], subf310['b'])) if (f321): subf321 = {'a' : '', 'b' : ''} for s in f321[0][2]: subf321[s[0]] = escape(s[1]) xml.append(' %s %s\n' % (subf321['a'], subf321['b'])) xml.append(' \n') # --- Language --- if (f8 and len(f8[0]) > 38): lang = f8[0][35:38] if (lang <> ' '): xml.append(' %s\n' % (lang)) if self.fields.has_key(41): a = two = '' for sub in self.fields[41][0][2]: if sub[0] == 'a': 
a = sub[1] elif sub[0] == '2': two = sub[1] elif sub[0] == 'd' and not a: a = sub[1] elif sub[0] == 'e' and not a: a = sub[1] if a and not two: xml.append(' %s\n' % (escape(a))) elif a: xml.append(' %s\n' % (escape(two), escape(a))) # --- Physical Description --- # XXX: Better field 008, 242,245,246$h, 256$a f300 = self.fields.get(300, []) if (f8 and len(f8[0]) > 23): f8_23 = self.fields[8][0][23] else: f8_23 = ' ' if (f300 or f8_23 == ' '): xml.append(" \n") if (f8_23 == ' '): xml.append('
print
\n') if f300: desclist = [] for s in f300[0][2]: desclist.append(escape(s[1])) desc = ' '.join(desclist) xml.append(" %s\n" % (desc)) xml.append("
\n") # Abstract if self.fields.has_key(520): xml.append(' ') for sub in self.fields[520]: if sub[0] == 'a' or sub[0] == 'b': xml.append(escape(sub[1])) xml.append("\n") # --- Table of Contents --- if (self.fields.has_key(505)): desclist = [] for s in self.fields[505][0][2]: if (s[0] in ['a', 'g', 'r', 't']): desclist.append(escape(s[1])) toc = ' '.join(desclist) xml.append(' %s\n' % (toc)) # XXX TargetAudience (field 8 again) # --- Note --- if (self.fields.has_key(500)): for n in (self.fields[500]): xml.append(' '); for s in n: if (s[0] == 'a'): xml.append(escape(s[1])) xml.append('\n') # --- Subject --- subjectList = [600, 610, 611, 630, 650, 651, 653] for s in subjectList: if self.fields.has_key(s): for instance in self.fields[s]: xml.append(" \n") if (s in [600, 610, 611]): stype = {600 : 'personal', 610 : 'corporate', 611 : 'conference'}[s] xml.append(' \n' % (stype)) for sub in instance[2]: val = escape(sub[1]) if (sub[0] == 'a'): xml.append(' %s\n' % (val)) elif (sub[0] == 'b'): attrib = '' if (s == 600): attrib = ' type="termsOfAddress"' xml.append(' %s\n' % (attrib, val)) elif (sub[0] == 'd'): xml.append(' %s\n' % (val)) elif (sub[0] == 'e'): xml.append(' %s\n' % (val)) elif (sub[0] == '4'): xml.append(' %s\n' % (val)) elif (sub[0] == 'u'): xml.append(' %s\n' % (val)) elif sub[0] in ['v', 'x']: xml.append(' %s\n' % (val)) elif sub[0] == 'y': xml.append(' %s\n' % (val)) elif sub[0] == 'z': xml.append(' %s\n' % (val)) xml.append(' \n') elif (s == 630): for sub in instance[2]: val = escape(sub[1]) if (sub[0] == 'a'): xml.append(' %s\n' % (val)) elif (sub[0] == 'p'): xml.append(' %s\n' % (val)) elif (sub[0] == 'n'): xml.append(' %s\n' % (val)) elif sub[0] in ['v', 'x']: xml.append(' %s\n' % (val)) elif sub[0] == 'y': xml.append(' %s\n' % (val)) elif sub[0] == 'z': xml.append(' %s\n' % (val)) elif (s in [650, 653]): for sub in instance[2]: val = escape(sub[1]) if (sub[0] == 'a'): xml.append(' %s\n' % (val)) elif sub[0] in ['v', 'x']: xml.append(' %s\n' % (val)) 
elif sub[0] == 'y': xml.append(' %s\n' % (val)) elif sub[0] == 'z': xml.append(' %s\n' % (val)) elif (s == 651): for sub in instance[2]: val = escape(sub[1]) if (sub[0] == 'a'): xml.append(' %s\n' % (val)) elif sub[0] in ['v', 'x']: xml.append(' %s\n' % (val)) elif sub[0] == 'y': xml.append(' %s\n' % (val)) elif sub[0] == 'z': xml.append(' %s\n' % (val)) xml.append(" \n") if (self.fields.has_key(45)): full = self.fields[45][0] if (full[0] in ['0', '1']): for x in self.fields[2]: if (x[0] == 'b'): xml.append(' %s\n' % (escape(x[1]))) if (self.fields.has_key(43)): for sub in self.fields[43][0][2]: if (sub[0] == 'a'): xml.append(' %s\n' % (escape(sub[1]))) elif (sub[0] == 'a'): xml.append(' %s\n' % (escape(sub[1]))) if (self.fields.has_key(752)): xml.append(' \n') for sub in self.fields[43][0][2]: val = escape(sub[1]) if (sub[0] == 'a'): xml.append(' %s\n' % (val)) elif (sub[0] == 'b'): xml.append(' %s\n' % (val)) elif (sub[0] == 'c'): xml.append(' %s\n' % (val)) elif (sub[0] == 'd'): xml.append(' %s\n' % (val)) xml.append(' ') if (self.fields.has_key(255)): subf = {} xml.append(' \n') for s in self.fields[255][0][2]: subf[s[0]] = escape(s[1]) if (subf.has_key('c')): xml.append(' %s\n' % (subf['c'])) if (subf.has_key('a')): xml.append(' %s\n' % (subf['a'])) if (subf.has_key('b')): xml.append(' %s\n' % (subf['c'])) xml.append(' \n') if (self.fields.has_key(656)): for s in self.fields[656][0][2]: if (s[0] == 'a'): xml.append(' %s\n') # XXX: 34 # XXX: Classification, 84 cfields = {50 : 'lcc', 82 : 'ddc', 80 : 'udc', 60 : 'nlm'} for k in cfields: if (self.fields.has_key(k)): for sub in self.fields[k][0][2]: stuff = [] if (sub[0] == 'a'): stuff.append(escape(sub[1])) elif (sub[0] == 'b'): stuff.append(escape(sub[1])) txt = ' '.join(stuff) xml.append(' %s\n' % (cfields[k], txt)) if (self.fields.has_key(86)): full = self.fields[86][0] ind1 = full[0] if (ind1 == '0'): auth = 'sudocs' elif (ind1 == '1'): auth = 'candocs' else: auth = '' if (auth): for s in full[2]: if (s[0] == 
'a'): xml.append(' %s\n' % (auth, escape(s[1]))) # XXX: relatedItem, 7XX # --- Identifier --- if self.fields.has_key(20): for instance in self.fields[20]: for sub in instance[2]: if sub[0] == 'a': xml.append(' %s\n' % (escape(sub[1]))) if self.fields.has_key(22): for instance in self.fields[22]: for sub in instance[2]: if sub[0] == 'a': xml.append(' %s\n' % (escape(sub[1]))) if self.fields.has_key(24): for instance in self.fields[24]: for sub in instance[2]: if sub[0] == 'a': xml.append(' %s\n' % (escape(sub[1]))) if self.fields.has_key(28): for instance in self.fields[28]: for sub in instance[2]: if sub[0] == 'a': xml.append(' %s\n' % (escape(sub[1]))) # XXX: location, accessCondition # --- recordInformation --- xml.append(' \n') if (self.fields.has_key(40)): for instance in self.fields[40]: for sub in instance[2]: if sub[0] == 'a': xml.append(' %s\n' % (escape(sub[1]))) if (self.fields.has_key(8)): date = self.fields[8][0][0:6] if (date <> ' '): xml.append(' %s\n' % (date)) if (self.fields.has_key(1)): xml.append(' %s\n' % (self.fields[1][0])) if (self.fields.has_key(40)): instance = self.fields[40][0][2] for s in instance: if (s[0] == 'b'): xml.append(' %s\n' % (escape(s[1]))) xml.append(' \n') xml.append("
") txt = ''.join(xml) return txt from PyZ3950 import marc_to_unicode # see http://www.loc.gov/marc/specifications/speccharmarc8.html import unicodedata class MARC8_to_Unicode: """Converts MARC-8 to Unicode. Note that currently, unicode strings aren't normalized, and some codecs (e.g. iso8859-1) will fail on such strings. When I can require python 2.3, this will go away. Warning: MARC-8 EACC (East Asian characters) makes some distinctions which aren't captured in Unicode. The LC tables give the option of mapping such characters either to a Unicode private use area, or a substitute character which (usually) gives the sense. I've picked the second, so this means that the MARC data should be treated as primary and the Unicode data used for display purposes only. (If you know of either of fonts designed for use with LC's private-use Unicode assignments, or of attempts to standardize Unicode characters to allow round-trips from EACC, or if you need the private-use Unicode character translations, please inform me, asl2@pobox.com.""" basic_latin = 0x42 ansel = 0x45 def __init__ (self, G0 = basic_latin, G1 = ansel): self.g0 = G0 self.g1 = G1 def is_multibyte (self, charset): return charset == 0x31 def translate (self, s): uni_list = [] combinings = [] pos = 0 while pos < len (s): if s[pos] == '\x1b': if (s[pos +1] == s[pos+2] and (s[pos +1] == '$' or s[pos+1] == '(')): self.g0 = ord (s[pos+3]) pos = pos + 4 continue mb_flag = self.is_multibyte (self.g0) if mb_flag: d = (ord (s[pos]) * 65536 + ord (s[pos+1]) * 256 + ord (s[pos+2])) pos += 3 else: d = ord (s[pos]) pos += 1 if (d < 0x20 or (d > 0x80 and d < 0xa0)): uni = unichr (d) continue if d > 0x80 and not mb_flag: (uni, cflag) = marc_to_unicode.codesets [self.g1] [d] else: (uni, cflag) = marc_to_unicode.codesets [self.g0] [d] if cflag: combinings.append (unichr (uni)) else: uni_list.append (unichr (uni)) if len (combinings) > 0: uni_list += combinings combinings = [] # what to do if combining chars left over? 
uni_str = u"".join (uni_list) # unicodedata.normalize not available until Python 2.3 if hasattr (unicodedata, 'normalize'): uni_str = unicodedata.normalize ('NFC', uni_str) return uni_str def test_convert (s, enc): conv = MARC8_to_Unicode () converted = conv.translate (s) converted = unicodedata.normalize ('NFC', converted) print converted.encode (enc) print repr (converted) if __name__ == '__main__': # My console is usually set to iso-8859-1. Sorry if yours is different. test_convert('''The oldest cuisine in the world : cooking in Mesopotamia / Jean Bott\xe2ero ; translated by Teresa Lavender Fagan.''', 'iso-8859-1') test_convert ( """$6 245-02/$1$a \x1b$$1!M>!`o!#!KPa!\\O!#!\x1b((B/$c \x1b$$1!1?!R_!#!-bb!#!!Gm!>`!#!\x1b((B; \x1b$$1!RY!YF!#!9Z6!#!!J(!Yi!#!\x1b((B;\x1b$$1!#!!BX!O>!#!!4`!4)!#!!\\e!#!!Hk!:M!#!\x1b((B... [et al.] ; \x1b$$1!Iq!MH!#!!9%!];!#!!KG!#!\x1b((B= Great garnishes / author, Huang Su-Huei ; translator, Yen-Jen Lai ; collaborators, Cheng-Tzu Chiu ... [et al.] ; photographers, Aki Ohno.""", 'utf-8') for f in sys.argv[1:]: marc_file = open(f, 'rb') marc_text = marc_file.read () while 1: marc_data1 = MARC(marc_text) print str (marc_data1) new = marc_data1.get_MARC () marc_data2 = MARC (marc_text) k1 = marc_data1.fields.keys () k2 = marc_data2.fields.keys () assert (k1 == k2) for field in k1: same = (marc_data1.fields [field] == marc_data2.fields [field]) assert (same) marc_text = marc_text[marc_data1.reclen:] if len (marc_text) == 0: break marc_file.close ()