From 795b420167556cf897cfee26c0b148a57e28fe63 Mon Sep 17 00:00:00 2001 From: Alexandre Courbot Date: Mon, 7 Mar 2011 20:40:06 +0900 Subject: [PATCH] Update merge script, separate mismatched files --- kanjivg.py | 73 ++++++++++++++++++++----------------------- mergexml.py | 83 +++++++++++++++++-------------------------------- updatepublic.sh | 11 +++---- 3 files changed, 66 insertions(+), 101 deletions(-) diff --git a/kanjivg.py b/kanjivg.py index 255b7a2d8..52644658a 100644 --- a/kanjivg.py +++ b/kanjivg.py @@ -18,7 +18,7 @@ from xmlhandler import * # Sample licence header -licenseString = """Copyright (C) 2009/2010 Ulrich Apel. +licenseString = """Copyright (C) 2009/2010/2011 Ulrich Apel. This work is distributed under the conditions of the Creative Commons Attribution-Share Alike 3.0 Licence. This means you are free: * to Share - to copy, distribute and transmit the work @@ -56,32 +56,35 @@ def realchr(i): class Kanji: """Describes a kanji. The root stroke group is accessible from the root member.""" - def __init__(self, id): - self.id = id - self.midashi = None + def __init__(self, code, variant): + # Unicode of char being represented (int) + self.code = code + # Variant of the character, if any + self.variant = variant self.root = None - def toSVG(self, out, indent = 0): - self.root.toSVG(out, self.id, [0], [1]) + # String identifier used to uniquely identify the kanji + def kId(self): + ret = "%05x" % (self.code,) + if self.variant: ret += "-%s" % (self.variant,) + return ret - def toXML(self, out, indent = 0): - out.write("\t" * indent + '\n' % (self.midashi, self.id)) - self.root.toXML(out, 0) - out.write("\t" * indent + '\n') + def toSVG(self, out, indent = 0): + self.root.toSVG(out, self.kId(), [0], [1]) def simplify(self): self.root.simplify() def getStrokes(self): return self.root.getStrokes() - + class StrokeGr: """Describes a stroke group belonging to a kanji as closely as possible to the XML format. Sub-stroke groups or strokes are available in the childs member. They can either be of class StrokeGr or Stroke so their type should be checked.""" def __init__(self, parent): self.parent = parent if parent: parent.childs.append(self) - # Element of strokegr, or midashi for kanji + # Element of strokegr self.element = None # A more common, safer element this one derives of self.original = None @@ -284,29 +287,19 @@ class StructuredStrokeGroup: class KanjisHandler(BasicHandler): """XML handler for parsing kanji files. It can handle single-kanji files or aggregation files. After parsing, the kanjis are accessible through the kanjis member, indexed by their svg file name.""" - def __init__(self): + def __init__(self, code, variant): BasicHandler.__init__(self) - self.kanjis = {} - self.currentKanji = None + self.kanji = Kanji(code, variant) self.groups = [] + self.compCpt = {} self.metComponents = set() def handle_start_kanji(self, attrs): - id = str(attrs["id"]) - self.currentKanji = Kanji(id) - self.currentKanji.midashi = unicode(attrs["midashi"]) - # Check that the ID matches the midashi - midashiNumber = "%04x" % (realord(self.currentKanji.midashi)) - if midashiNumber != id[:len(midashiNumber)]: - print "Warning: id does not match midashi (%s(%s) %s)" % (self.currentKanji.midashi, midashiNumber, id) - self.kanjis[id] = self.currentKanji - - self.compCpt = {} + pass def handle_end_kanji(self): if len(self.groups) != 0: print "WARNING: stroke groups remaining after reading kanji!" - self.currentKanji = None self.groups = [] def handle_start_strokegr(self, attrs): @@ -333,17 +326,17 @@ class KanjisHandler(BasicHandler): if group.original: self.metComponents.add(group.original) if group.number: - if not group.part: print "%s: Number specified, but part missing" % (self.currentKanji.id) + if not group.part: print "%s: Number specified, but part missing" % (self.kanji.kId()) # The group must exist already if group.part > 1: if not self.compCpt.has_key(group.element + str(group.number)): - print "%s: Missing numbered group" % (self.currentKanji.id) + print "%s: Missing numbered group" % (self.kanji.kId()) elif self.compCpt[group.element + str(group.number)] != group.part - 1: - print "%s: Incorrectly numbered group" % (self.currentKanji.id) + print "%s: Incorrectly numbered group" % (self.kanji.kId()) # The group must not exist else: if self.compCpt.has_key(group.element + str(group.number)): - print "%s: Duplicate numbered group" % (self.currentKanji.id) + print "%s: Duplicate numbered group" % (self.kanji.kId()) self.compCpt[group.element + str(group.number)] = group.part # No number, just a part - groups restart with part 1, otherwise must # increase correctly @@ -351,17 +344,17 @@ class KanjisHandler(BasicHandler): # The group must exist already if group.part > 1: if not self.compCpt.has_key(group.element): - print "%s: Incorrectly started multi-part group" % (self.currentKanji.id) + print "%s: Incorrectly started multi-part group" % (self.kanji.kId()) elif self.compCpt[group.element] != group.part - 1: - print "%s: Incorrectly splitted multi-part group" % (self.currentKanji.id) + print "%s: Incorrectly splitted multi-part group" % (self.kanji.kId()) self.compCpt[group.element] = group.part def handle_end_strokegr(self): group = self.groups.pop() if len(self.groups) == 0: - if self.currentKanji.root: + if self.kanji.root: print "WARNING: overwriting root of kanji!" - self.currentKanji.root = group + self.kanji.root = group def handle_start_stroke(self, attrs): if len(self.groups) == 0: parent = None @@ -410,17 +403,17 @@ class SVGHandler(BasicHandler): if group.original: self.metComponents.add(group.original) if group.number: - if not group.part: print "%s: Number specified, but part missing" % (self.currentKanji.id) + if not group.part: print "%s: Number specified, but part missing" % (self.currentKanji.kId()) # The group must exist already if group.part > 1: if not self.compCpt.has_key(group.element + str(group.number)): - print "%s: Missing numbered group" % (self.currentKanji.id) + print "%s: Missing numbered group" % (self.currentKanji.kId()) elif self.compCpt[group.element + str(group.number)] != group.part - 1: - print "%s: Incorrectly numbered group" % (self.currentKanji.id) + print "%s: Incorrectly numbered group" % (self.currentKanji.kId()) # The group must not exist else: if self.compCpt.has_key(group.element + str(group.number)): - print "%s: Duplicate numbered group" % (self.currentKanji.id) + print "%s: Duplicate numbered group" % (self.currentKanji.kId()) self.compCpt[group.element + str(group.number)] = group.part # No number, just a part - groups restart with part 1, otherwise must # increase correctly @@ -428,9 +421,9 @@ class SVGHandler(BasicHandler): # The group must exist already if group.part > 1: if not self.compCpt.has_key(group.element): - print "%s: Incorrectly started multi-part group" % (self.currentKanji.id) + print "%s: Incorrectly started multi-part group" % (self.currentKanji.kId()) elif self.compCpt[group.element] != group.part - 1: - print "%s: Incorrectly splitted multi-part group" % (self.currentKanji.id) + print "%s: Incorrectly splitted multi-part group" % (self.currentKanji.kId()) self.compCpt[group.element] = group.part def handle_end_g(self): diff --git a/mergexml.py b/mergexml.py index dfe799a3d..8db123e55 100755 --- a/mergexml.py +++ b/mergexml.py @@ -75,53 +75,66 @@ class KanjiStrokeHandler(BasicHandler): if attrs.has_key("id") and attrs["id"] == "Vektorbild": self.active = True if __name__ == "__main__": + os.mkdir("output") + os.mkdir("output/SVG") + os.mkdir("output/SVGMismatch") files = os.listdir("XML") - kanjis = [] - mismatch = [] handled = set() metComponents = set() for f in files: - # Let's keep the variations out of the process for now... - if '-' in f: continue if not f.endswith(".xml"): continue - descHandler = KanjisHandler() - xml.sax.parse(os.path.join("XML", f), descHandler) - handled.add(realchr(int(f[:-4], 16))) + kId = f[:-4] + if "-" in kId: code, variant = kId.split("-") + else: code, variant = kId, None + # Parse XML + descHandler = KanjisHandler(int(code, 16), variant) + xml.sax.parse(os.path.join("XML", f), descHandler) + handled.add(kId) + + # Parse SVG parser = xml.sax.make_parser() svgHandler = KanjiStrokeHandler() parser.setContentHandler(svgHandler) parser.setFeature(xml.sax.handler.feature_external_ges, False) parser.setFeature(xml.sax.handler.feature_external_pes, False) - svgFile = os.path.join("SVG", f[:-3] + "svg") + svgFile = os.path.join("SVG", kId + ".svg") if os.path.exists(svgFile): parser.parse(svgFile) metComponents = metComponents.union(descHandler.metComponents) - kanji = descHandler.kanjis.values()[0] + kanji = descHandler.kanji desc = kanji.getStrokes() svg = svgHandler.strokes - if len(desc) != len(svg): mismatch.append((descHandler.kanjis.values()[0].root.element, len(desc), len(svg))) + # Copy SVG into kanji desc for i in range(min(len(desc), len(svg))): desc[i].svg = svg[i] + # Add dummy strokes for SVG orphans for i in range(len(desc), len(svg)): s = Stroke(kanji.root) s.stype = "Missing stroke" s.svg = svg[i] kanji.root.childs.append(s) - kanjis.append(kanji) + + if len(desc) != len(svg): dst = "SVGMismatch" + else: dst = "SVG" + out = codecs.open("output/%s/%s.svg" % (dst, kanji.kId()), "w", "utf-8") + createSVG(out, kanji) + # Now parse orphan SVGs (probably just kana and romaji) files = os.listdir("SVG") for f in files: - # Let's keep the variations out of the process for now... - if '-' in f: continue if not f.endswith(".svg"): continue - if realchr(int(f[:-4], 16)) in handled: continue + kId = f[:-4] + if "-" in kId: code, variant = kId.split("-") + else: code, variant = kId, None + + if f[:-4] in handled: continue parser = xml.sax.make_parser() svgHandler = KanjiStrokeHandler() parser.setContentHandler(svgHandler) @@ -129,48 +142,10 @@ if __name__ == "__main__": parser.setFeature(xml.sax.handler.feature_external_pes, False) parser.parse(os.path.join("SVG", f)) - kanji = Kanji(f[:-4]) - kanji.midashi = unichr(int(f[:-4], 16)) + kanji = Kanji(int(code, 16), variant) kanji.root = StrokeGr(None) for s in svgHandler.strokes: stroke = Stroke(kanji.root) stroke.svg = s kanji.root.childs.append(stroke) - kanjis.append(kanji) - - # Stroke count mismatch kanji - mismatch.sort() - misout = codecs.open("Main.StrokeCountMismatch", "w", "utf-8") - misout.write('version=pmwiki-2.1.0 urlencoded=1\ntext=') - misout.write("'''This page is generated - please do not edit it!'''%0a%0aThe following kanji have a stroke order mismatch between their XML and SVG descriptions:%0a") - for i in range(len(mismatch)): - misout.write("* %s: XML %d, SVG %d" % (mismatch[i][0], mismatch[i][1], mismatch[i][2])) - misout.write("%0a") - - # Missing components - misout = codecs.open("Main.MissingKanji", "w", "utf-8") - misout.write('version=pmwiki-2.1.0 urlencoded=1\ntext=') - misout.write("'''This page is generated - please do not edit it!'''%0a%0aThe following kanji are referenced as components but no data is available for them:%0a") - for k in metComponents.difference(handled): - misout.write("* %s" % (k,)) - misout.write("%0a") - - # Finally write the output files - os.mkdir("data") - for kanji in kanjis: - out = codecs.open("data/" + str(kanji.id) + ".svg", "w", "utf-8") - createSVG(out, kanji) - - # Finally write the output file - #curDate = str(datetime.date.today()) - #kanjis.sort(lambda x,y: cmp(x.id, y.id)) - #out = codecs.open("kanjivg-%s.xml" % (curDate.replace("-", ""),), "w", "utf-8") - #out.write('\n') - #out.write("\n\n") - #out.write("\n"); - #for kanji in kanjis: - #kanji.toXML(out) - #out.write("\n"); + # TODO merge with upper part - kana and romaji should not be considered mismatched diff --git a/updatepublic.sh b/updatepublic.sh index 2850f46a9..356737eeb 100755 --- a/updatepublic.sh +++ b/updatepublic.sh @@ -1,13 +1,10 @@ #!/bin/sh -rm -Rf kanjivg.xml.gz generated +rm -Rf kanjivg.xml.gz data ./mergexml.py -outFile="kanjivg-`date +\"%Y%m%d\"`.xml" -mkdir -p currentdata/SVG -./createsvgfiles.py $outFile -tar czf currentdata.tar.gz currentdata +outFile="kanjivg-`date +\"%Y%m%d\"`.tar.gz" +tar cvzf $outFile data gzip $outFile scp $outFile.gz gnurou@gnurou.org:/srv/http/kanjivg/upload/Main/ -scp currentdata.tar.gz gnurou@gnurou.org:/home/gnurou scp Main.StrokeCountMismatch Main.MissingKanji gnurou@gnurou.org:/srv/http/kanjivg/wiki.d -ssh gnurou@gnurou.org "cd /srv/http/kanjivg ; rm -Rf currentdata ; tar xfz /home/gnurou/currentdata.tar.gz ; rm /home/gnurou/currentdata.tar.gz ; cd upload/Main ; ln -sf $outFile.gz kanjivg-latest.xml.gz" +#ssh gnurou@gnurou.org "cd /srv/http/kanjivg ; rm -Rf currentdata ; tar xfz /home/gnurou/currentdata.tar.gz ; rm /home/gnurou/currentdata.tar.gz ; cd upload/Main ; ln -sf $outFile.gz kanjivg-latest.xml.gz"