1
0
mirror of https://github.com/KanjiVG/kanjivg.git synced 2026-01-27 16:43:12 +01:00

Update merge script, separate mismatched files

This commit is contained in:
Alexandre Courbot
2011-03-07 20:40:06 +09:00
parent 0e2eae27a7
commit 795b420167
3 changed files with 66 additions and 101 deletions

View File

@@ -18,7 +18,7 @@
from xmlhandler import *
# Sample licence header
licenseString = """Copyright (C) 2009/2010 Ulrich Apel.
licenseString = """Copyright (C) 2009/2010/2011 Ulrich Apel.
This work is distributed under the conditions of the Creative Commons
Attribution-Share Alike 3.0 Licence. This means you are free:
* to Share - to copy, distribute and transmit the work
@@ -56,32 +56,35 @@ def realchr(i):
class Kanji:
"""Describes a kanji. The root stroke group is accessible from the root member."""
def __init__(self, id):
self.id = id
self.midashi = None
def __init__(self, code, variant):
# Unicode code point of the character being represented (int)
self.code = code
# Variant of the character, if any
self.variant = variant
self.root = None
def toSVG(self, out, indent = 0):
self.root.toSVG(out, self.id, [0], [1])
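# String identifier used to uniquely identify the kanji: the code as lowercase hex, zero-padded to at least 5 digits, followed by "-<variant>" when a variant is set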
def kId(self):
ret = "%05x" % (self.code,)
if self.variant: ret += "-%s" % (self.variant,)
return ret
def toXML(self, out, indent = 0):
out.write("\t" * indent + '<kanji midashi="%s" id="%s">\n' % (self.midashi, self.id))
self.root.toXML(out, 0)
out.write("\t" * indent + '</kanji>\n')
def toSVG(self, out, indent = 0):
self.root.toSVG(out, self.kId(), [0], [1])
def simplify(self):
self.root.simplify()
def getStrokes(self):
return self.root.getStrokes()
class StrokeGr:
"""Describes a stroke group belonging to a kanji as closely as possible to the XML format. Sub-stroke groups or strokes are available in the childs member. They can either be of class StrokeGr or Stroke so their type should be checked."""
def __init__(self, parent):
self.parent = parent
if parent: parent.childs.append(self)
# Element of strokegr, or midashi for kanji
# Element of strokegr
self.element = None
# A more common, safer element that this one derives from
self.original = None
@@ -284,29 +287,19 @@ class StructuredStrokeGroup:
class KanjisHandler(BasicHandler):
"""XML handler for parsing a kanji file. The handler is created for one kanji (code and variant); after parsing, that kanji is accessible through the kanji member."""
def __init__(self):
def __init__(self, code, variant):
BasicHandler.__init__(self)
self.kanjis = {}
self.currentKanji = None
self.kanji = Kanji(code, variant)
self.groups = []
self.compCpt = {}
self.metComponents = set()
def handle_start_kanji(self, attrs):
id = str(attrs["id"])
self.currentKanji = Kanji(id)
self.currentKanji.midashi = unicode(attrs["midashi"])
# Check that the ID matches the midashi
midashiNumber = "%04x" % (realord(self.currentKanji.midashi))
if midashiNumber != id[:len(midashiNumber)]:
print "Warning: id does not match midashi (%s(%s) %s)" % (self.currentKanji.midashi, midashiNumber, id)
self.kanjis[id] = self.currentKanji
self.compCpt = {}
pass
def handle_end_kanji(self):
if len(self.groups) != 0:
print "WARNING: stroke groups remaining after reading kanji!"
self.currentKanji = None
self.groups = []
def handle_start_strokegr(self, attrs):
@@ -333,17 +326,17 @@ class KanjisHandler(BasicHandler):
if group.original: self.metComponents.add(group.original)
if group.number:
if not group.part: print "%s: Number specified, but part missing" % (self.currentKanji.id)
if not group.part: print "%s: Number specified, but part missing" % (self.kanji.kId())
# The group must exist already
if group.part > 1:
if not self.compCpt.has_key(group.element + str(group.number)):
print "%s: Missing numbered group" % (self.currentKanji.id)
print "%s: Missing numbered group" % (self.kanji.kId())
elif self.compCpt[group.element + str(group.number)] != group.part - 1:
print "%s: Incorrectly numbered group" % (self.currentKanji.id)
print "%s: Incorrectly numbered group" % (self.kanji.kId())
# The group must not exist
else:
if self.compCpt.has_key(group.element + str(group.number)):
print "%s: Duplicate numbered group" % (self.currentKanji.id)
print "%s: Duplicate numbered group" % (self.kanji.kId())
self.compCpt[group.element + str(group.number)] = group.part
# No number, just a part - groups restart with part 1, otherwise must
# increase correctly
@@ -351,17 +344,17 @@ class KanjisHandler(BasicHandler):
# The group must exist already
if group.part > 1:
if not self.compCpt.has_key(group.element):
print "%s: Incorrectly started multi-part group" % (self.currentKanji.id)
print "%s: Incorrectly started multi-part group" % (self.kanji.kId())
elif self.compCpt[group.element] != group.part - 1:
print "%s: Incorrectly splitted multi-part group" % (self.currentKanji.id)
print "%s: Incorrectly splitted multi-part group" % (self.kanji.kId())
self.compCpt[group.element] = group.part
def handle_end_strokegr(self):
group = self.groups.pop()
if len(self.groups) == 0:
if self.currentKanji.root:
if self.kanji.root:
print "WARNING: overwriting root of kanji!"
self.currentKanji.root = group
self.kanji.root = group
def handle_start_stroke(self, attrs):
if len(self.groups) == 0: parent = None
@@ -410,17 +403,17 @@ class SVGHandler(BasicHandler):
if group.original: self.metComponents.add(group.original)
if group.number:
if not group.part: print "%s: Number specified, but part missing" % (self.currentKanji.id)
if not group.part: print "%s: Number specified, but part missing" % (self.currentKanji.kId())
# The group must exist already
if group.part > 1:
if not self.compCpt.has_key(group.element + str(group.number)):
print "%s: Missing numbered group" % (self.currentKanji.id)
print "%s: Missing numbered group" % (self.currentKanji.kId())
elif self.compCpt[group.element + str(group.number)] != group.part - 1:
print "%s: Incorrectly numbered group" % (self.currentKanji.id)
print "%s: Incorrectly numbered group" % (self.currentKanji.kId())
# The group must not exist
else:
if self.compCpt.has_key(group.element + str(group.number)):
print "%s: Duplicate numbered group" % (self.currentKanji.id)
print "%s: Duplicate numbered group" % (self.currentKanji.kId())
self.compCpt[group.element + str(group.number)] = group.part
# No number, just a part - groups restart with part 1, otherwise must
# increase correctly
@@ -428,9 +421,9 @@ class SVGHandler(BasicHandler):
# The group must exist already
if group.part > 1:
if not self.compCpt.has_key(group.element):
print "%s: Incorrectly started multi-part group" % (self.currentKanji.id)
print "%s: Incorrectly started multi-part group" % (self.currentKanji.kId())
elif self.compCpt[group.element] != group.part - 1:
print "%s: Incorrectly splitted multi-part group" % (self.currentKanji.id)
print "%s: Incorrectly splitted multi-part group" % (self.currentKanji.kId())
self.compCpt[group.element] = group.part
def handle_end_g(self):

View File

@@ -75,53 +75,66 @@ class KanjiStrokeHandler(BasicHandler):
if attrs.has_key("id") and attrs["id"] == "Vektorbild": self.active = True
if __name__ == "__main__":
os.mkdir("output")
os.mkdir("output/SVG")
os.mkdir("output/SVGMismatch")
files = os.listdir("XML")
kanjis = []
mismatch = []
handled = set()
metComponents = set()
for f in files:
# Let's keep the variations out of the process for now...
if '-' in f: continue
if not f.endswith(".xml"): continue
descHandler = KanjisHandler()
xml.sax.parse(os.path.join("XML", f), descHandler)
handled.add(realchr(int(f[:-4], 16)))
kId = f[:-4]
if "-" in kId: code, variant = kId.split("-")
else: code, variant = kId, None
# Parse XML
descHandler = KanjisHandler(int(code, 16), variant)
xml.sax.parse(os.path.join("XML", f), descHandler)
handled.add(kId)
# Parse SVG
parser = xml.sax.make_parser()
svgHandler = KanjiStrokeHandler()
parser.setContentHandler(svgHandler)
parser.setFeature(xml.sax.handler.feature_external_ges, False)
parser.setFeature(xml.sax.handler.feature_external_pes, False)
svgFile = os.path.join("SVG", f[:-3] + "svg")
svgFile = os.path.join("SVG", kId + ".svg")
if os.path.exists(svgFile):
parser.parse(svgFile)
metComponents = metComponents.union(descHandler.metComponents)
kanji = descHandler.kanjis.values()[0]
kanji = descHandler.kanji
desc = kanji.getStrokes()
svg = svgHandler.strokes
if len(desc) != len(svg): mismatch.append((descHandler.kanjis.values()[0].root.element, len(desc), len(svg)))
# Copy SVG into kanji desc
for i in range(min(len(desc), len(svg))):
desc[i].svg = svg[i]
# Add dummy strokes for SVG orphans
for i in range(len(desc), len(svg)):
s = Stroke(kanji.root)
s.stype = "Missing stroke"
s.svg = svg[i]
kanji.root.childs.append(s)
kanjis.append(kanji)
if len(desc) != len(svg): dst = "SVGMismatch"
else: dst = "SVG"
out = codecs.open("output/%s/%s.svg" % (dst, kanji.kId()), "w", "utf-8")
createSVG(out, kanji)
# Now parse orphan SVGs (probably just kana and romaji)
files = os.listdir("SVG")
for f in files:
# Let's keep the variations out of the process for now...
if '-' in f: continue
if not f.endswith(".svg"): continue
if realchr(int(f[:-4], 16)) in handled: continue
kId = f[:-4]
if "-" in kId: code, variant = kId.split("-")
else: code, variant = kId, None
if f[:-4] in handled: continue
parser = xml.sax.make_parser()
svgHandler = KanjiStrokeHandler()
parser.setContentHandler(svgHandler)
@@ -129,48 +142,10 @@ if __name__ == "__main__":
parser.setFeature(xml.sax.handler.feature_external_pes, False)
parser.parse(os.path.join("SVG", f))
kanji = Kanji(f[:-4])
kanji.midashi = unichr(int(f[:-4], 16))
kanji = Kanji(int(code, 16), variant)
kanji.root = StrokeGr(None)
for s in svgHandler.strokes:
stroke = Stroke(kanji.root)
stroke.svg = s
kanji.root.childs.append(stroke)
kanjis.append(kanji)
# Stroke count mismatch kanji
mismatch.sort()
misout = codecs.open("Main.StrokeCountMismatch", "w", "utf-8")
misout.write('version=pmwiki-2.1.0 urlencoded=1\ntext=')
misout.write("'''This page is generated - please do not edit it!'''%0a%0aThe following kanji have a stroke order mismatch between their XML and SVG descriptions:%0a")
for i in range(len(mismatch)):
misout.write("* %s: XML %d, SVG %d" % (mismatch[i][0], mismatch[i][1], mismatch[i][2]))
misout.write("%0a")
# Missing components
misout = codecs.open("Main.MissingKanji", "w", "utf-8")
misout.write('version=pmwiki-2.1.0 urlencoded=1\ntext=')
misout.write("'''This page is generated - please do not edit it!'''%0a%0aThe following kanji are referenced as components but no data is available for them:%0a")
for k in metComponents.difference(handled):
misout.write("* %s" % (k,))
misout.write("%0a")
# Finally write the output files
os.mkdir("data")
for kanji in kanjis:
out = codecs.open("data/" + str(kanji.id) + ".svg", "w", "utf-8")
createSVG(out, kanji)
# Finally write the output file
#curDate = str(datetime.date.today())
#kanjis.sort(lambda x,y: cmp(x.id, y.id))
#out = codecs.open("kanjivg-%s.xml" % (curDate.replace("-", ""),), "w", "utf-8")
#out.write('<?xml version="1.0" encoding="UTF-8"?>\n')
#out.write("<!-- ")
#out.write(licenseString)
#out.write("\nThis file has been generated on %s, using the latest KanjiVG data to this date." % (curDate))
#out.write("\n-->\n\n")
#out.write("<kanjis>\n");
#for kanji in kanjis:
#kanji.toXML(out)
#out.write("</kanjis>\n");
# TODO merge with upper part - kana and romaji should not be considered mismatched

View File

@@ -1,13 +1,10 @@
#!/bin/sh
rm -Rf kanjivg.xml.gz generated
rm -Rf kanjivg.xml.gz data
./mergexml.py
outFile="kanjivg-`date +\"%Y%m%d\"`.xml"
mkdir -p currentdata/SVG
./createsvgfiles.py $outFile
tar czf currentdata.tar.gz currentdata
outFile="kanjivg-`date +\"%Y%m%d\"`.tar.gz"
tar cvzf $outFile data
gzip $outFile
scp $outFile.gz gnurou@gnurou.org:/srv/http/kanjivg/upload/Main/
scp currentdata.tar.gz gnurou@gnurou.org:/home/gnurou
scp Main.StrokeCountMismatch Main.MissingKanji gnurou@gnurou.org:/srv/http/kanjivg/wiki.d
ssh gnurou@gnurou.org "cd /srv/http/kanjivg ; rm -Rf currentdata ; tar xfz /home/gnurou/currentdata.tar.gz ; rm /home/gnurou/currentdata.tar.gz ; cd upload/Main ; ln -sf $outFile.gz kanjivg-latest.xml.gz"
#ssh gnurou@gnurou.org "cd /srv/http/kanjivg ; rm -Rf currentdata ; tar xfz /home/gnurou/currentdata.tar.gz ; rm /home/gnurou/currentdata.tar.gz ; cd upload/Main ; ln -sf $outFile.gz kanjivg-latest.xml.gz"