mirror of
https://github.com/KanjiVG/kanjivg.git
synced 2026-01-27 16:43:12 +01:00
Update merge script, separate mismatched files
This commit is contained in:
73
kanjivg.py
73
kanjivg.py
@@ -18,7 +18,7 @@
|
||||
from xmlhandler import *
|
||||
|
||||
# Sample licence header
|
||||
licenseString = """Copyright (C) 2009/2010 Ulrich Apel.
|
||||
licenseString = """Copyright (C) 2009/2010/2011 Ulrich Apel.
|
||||
This work is distributed under the conditions of the Creative Commons
|
||||
Attribution-Share Alike 3.0 Licence. This means you are free:
|
||||
* to Share - to copy, distribute and transmit the work
|
||||
@@ -56,32 +56,35 @@ def realchr(i):
|
||||
|
||||
class Kanji:
|
||||
"""Describes a kanji. The root stroke group is accessible from the root member."""
|
||||
def __init__(self, id):
|
||||
self.id = id
|
||||
self.midashi = None
|
||||
def __init__(self, code, variant):
|
||||
# Unicode of char being represented (int)
|
||||
self.code = code
|
||||
# Variant of the character, if any
|
||||
self.variant = variant
|
||||
self.root = None
|
||||
|
||||
def toSVG(self, out, indent = 0):
|
||||
self.root.toSVG(out, self.id, [0], [1])
|
||||
# String identifier used to uniquely identify the kanji
|
||||
def kId(self):
|
||||
ret = "%05x" % (self.code,)
|
||||
if self.variant: ret += "-%s" % (self.variant,)
|
||||
return ret
|
||||
|
||||
def toXML(self, out, indent = 0):
|
||||
out.write("\t" * indent + '<kanji midashi="%s" id="%s">\n' % (self.midashi, self.id))
|
||||
self.root.toXML(out, 0)
|
||||
out.write("\t" * indent + '</kanji>\n')
|
||||
def toSVG(self, out, indent = 0):
|
||||
self.root.toSVG(out, self.kId(), [0], [1])
|
||||
|
||||
def simplify(self):
|
||||
self.root.simplify()
|
||||
|
||||
def getStrokes(self):
|
||||
return self.root.getStrokes()
|
||||
|
||||
|
||||
|
||||
class StrokeGr:
|
||||
"""Describes a stroke group belonging to a kanji as closely as possible to the XML format. Sub-stroke groups or strokes are available in the childs member. They can either be of class StrokeGr or Stroke so their type should be checked."""
|
||||
def __init__(self, parent):
|
||||
self.parent = parent
|
||||
if parent: parent.childs.append(self)
|
||||
# Element of strokegr, or midashi for kanji
|
||||
# Element of strokegr
|
||||
self.element = None
|
||||
# A more common, safer element this one derives from
|
||||
self.original = None
|
||||
@@ -284,29 +287,19 @@ class StructuredStrokeGroup:
|
||||
|
||||
class KanjisHandler(BasicHandler):
|
||||
"""XML handler for parsing kanji files. It can handle single-kanji files or aggregation files. After parsing, the kanjis are accessible through the kanjis member, indexed by their svg file name."""
|
||||
def __init__(self):
|
||||
def __init__(self, code, variant):
|
||||
BasicHandler.__init__(self)
|
||||
self.kanjis = {}
|
||||
self.currentKanji = None
|
||||
self.kanji = Kanji(code, variant)
|
||||
self.groups = []
|
||||
self.compCpt = {}
|
||||
self.metComponents = set()
|
||||
|
||||
def handle_start_kanji(self, attrs):
|
||||
id = str(attrs["id"])
|
||||
self.currentKanji = Kanji(id)
|
||||
self.currentKanji.midashi = unicode(attrs["midashi"])
|
||||
# Check that the ID matches the midashi
|
||||
midashiNumber = "%04x" % (realord(self.currentKanji.midashi))
|
||||
if midashiNumber != id[:len(midashiNumber)]:
|
||||
print "Warning: id does not match midashi (%s(%s) %s)" % (self.currentKanji.midashi, midashiNumber, id)
|
||||
self.kanjis[id] = self.currentKanji
|
||||
|
||||
self.compCpt = {}
|
||||
pass
|
||||
|
||||
def handle_end_kanji(self):
|
||||
if len(self.groups) != 0:
|
||||
print "WARNING: stroke groups remaining after reading kanji!"
|
||||
self.currentKanji = None
|
||||
self.groups = []
|
||||
|
||||
def handle_start_strokegr(self, attrs):
|
||||
@@ -333,17 +326,17 @@ class KanjisHandler(BasicHandler):
|
||||
if group.original: self.metComponents.add(group.original)
|
||||
|
||||
if group.number:
|
||||
if not group.part: print "%s: Number specified, but part missing" % (self.currentKanji.id)
|
||||
if not group.part: print "%s: Number specified, but part missing" % (self.kanji.kId())
|
||||
# The group must exist already
|
||||
if group.part > 1:
|
||||
if not self.compCpt.has_key(group.element + str(group.number)):
|
||||
print "%s: Missing numbered group" % (self.currentKanji.id)
|
||||
print "%s: Missing numbered group" % (self.kanji.kId())
|
||||
elif self.compCpt[group.element + str(group.number)] != group.part - 1:
|
||||
print "%s: Incorrectly numbered group" % (self.currentKanji.id)
|
||||
print "%s: Incorrectly numbered group" % (self.kanji.kId())
|
||||
# The group must not exist
|
||||
else:
|
||||
if self.compCpt.has_key(group.element + str(group.number)):
|
||||
print "%s: Duplicate numbered group" % (self.currentKanji.id)
|
||||
print "%s: Duplicate numbered group" % (self.kanji.kId())
|
||||
self.compCpt[group.element + str(group.number)] = group.part
|
||||
# No number, just a part - groups restart with part 1, otherwise must
|
||||
# increase correctly
|
||||
@@ -351,17 +344,17 @@ class KanjisHandler(BasicHandler):
|
||||
# The group must exist already
|
||||
if group.part > 1:
|
||||
if not self.compCpt.has_key(group.element):
|
||||
print "%s: Incorrectly started multi-part group" % (self.currentKanji.id)
|
||||
print "%s: Incorrectly started multi-part group" % (self.kanji.kId())
|
||||
elif self.compCpt[group.element] != group.part - 1:
|
||||
print "%s: Incorrectly splitted multi-part group" % (self.currentKanji.id)
|
||||
print "%s: Incorrectly splitted multi-part group" % (self.kanji.kId())
|
||||
self.compCpt[group.element] = group.part
|
||||
|
||||
def handle_end_strokegr(self):
|
||||
group = self.groups.pop()
|
||||
if len(self.groups) == 0:
|
||||
if self.currentKanji.root:
|
||||
if self.kanji.root:
|
||||
print "WARNING: overwriting root of kanji!"
|
||||
self.currentKanji.root = group
|
||||
self.kanji.root = group
|
||||
|
||||
def handle_start_stroke(self, attrs):
|
||||
if len(self.groups) == 0: parent = None
|
||||
@@ -410,17 +403,17 @@ class SVGHandler(BasicHandler):
|
||||
if group.original: self.metComponents.add(group.original)
|
||||
|
||||
if group.number:
|
||||
if not group.part: print "%s: Number specified, but part missing" % (self.currentKanji.id)
|
||||
if not group.part: print "%s: Number specified, but part missing" % (self.currentKanji.kId())
|
||||
# The group must exist already
|
||||
if group.part > 1:
|
||||
if not self.compCpt.has_key(group.element + str(group.number)):
|
||||
print "%s: Missing numbered group" % (self.currentKanji.id)
|
||||
print "%s: Missing numbered group" % (self.currentKanji.kId())
|
||||
elif self.compCpt[group.element + str(group.number)] != group.part - 1:
|
||||
print "%s: Incorrectly numbered group" % (self.currentKanji.id)
|
||||
print "%s: Incorrectly numbered group" % (self.currentKanji.kId())
|
||||
# The group must not exist
|
||||
else:
|
||||
if self.compCpt.has_key(group.element + str(group.number)):
|
||||
print "%s: Duplicate numbered group" % (self.currentKanji.id)
|
||||
print "%s: Duplicate numbered group" % (self.currentKanji.kId())
|
||||
self.compCpt[group.element + str(group.number)] = group.part
|
||||
# No number, just a part - groups restart with part 1, otherwise must
|
||||
# increase correctly
|
||||
@@ -428,9 +421,9 @@ class SVGHandler(BasicHandler):
|
||||
# The group must exist already
|
||||
if group.part > 1:
|
||||
if not self.compCpt.has_key(group.element):
|
||||
print "%s: Incorrectly started multi-part group" % (self.currentKanji.id)
|
||||
print "%s: Incorrectly started multi-part group" % (self.currentKanji.kId())
|
||||
elif self.compCpt[group.element] != group.part - 1:
|
||||
print "%s: Incorrectly splitted multi-part group" % (self.currentKanji.id)
|
||||
print "%s: Incorrectly splitted multi-part group" % (self.currentKanji.kId())
|
||||
self.compCpt[group.element] = group.part
|
||||
|
||||
def handle_end_g(self):
|
||||
|
||||
83
mergexml.py
83
mergexml.py
@@ -75,53 +75,66 @@ class KanjiStrokeHandler(BasicHandler):
|
||||
if attrs.has_key("id") and attrs["id"] == "Vektorbild": self.active = True
|
||||
|
||||
if __name__ == "__main__":
|
||||
os.mkdir("output")
|
||||
os.mkdir("output/SVG")
|
||||
os.mkdir("output/SVGMismatch")
|
||||
files = os.listdir("XML")
|
||||
kanjis = []
|
||||
mismatch = []
|
||||
handled = set()
|
||||
metComponents = set()
|
||||
for f in files:
|
||||
# Let's keep the variations out of the process for now...
|
||||
if '-' in f: continue
|
||||
if not f.endswith(".xml"): continue
|
||||
|
||||
descHandler = KanjisHandler()
|
||||
xml.sax.parse(os.path.join("XML", f), descHandler)
|
||||
handled.add(realchr(int(f[:-4], 16)))
|
||||
kId = f[:-4]
|
||||
if "-" in kId: code, variant = kId.split("-")
|
||||
else: code, variant = kId, None
|
||||
|
||||
# Parse XML
|
||||
descHandler = KanjisHandler(int(code, 16), variant)
|
||||
xml.sax.parse(os.path.join("XML", f), descHandler)
|
||||
handled.add(kId)
|
||||
|
||||
# Parse SVG
|
||||
parser = xml.sax.make_parser()
|
||||
svgHandler = KanjiStrokeHandler()
|
||||
parser.setContentHandler(svgHandler)
|
||||
parser.setFeature(xml.sax.handler.feature_external_ges, False)
|
||||
parser.setFeature(xml.sax.handler.feature_external_pes, False)
|
||||
svgFile = os.path.join("SVG", f[:-3] + "svg")
|
||||
svgFile = os.path.join("SVG", kId + ".svg")
|
||||
if os.path.exists(svgFile):
|
||||
parser.parse(svgFile)
|
||||
|
||||
metComponents = metComponents.union(descHandler.metComponents)
|
||||
|
||||
kanji = descHandler.kanjis.values()[0]
|
||||
kanji = descHandler.kanji
|
||||
desc = kanji.getStrokes()
|
||||
svg = svgHandler.strokes
|
||||
if len(desc) != len(svg): mismatch.append((descHandler.kanjis.values()[0].root.element, len(desc), len(svg)))
|
||||
# Copy SVG into kanji desc
|
||||
for i in range(min(len(desc), len(svg))):
|
||||
desc[i].svg = svg[i]
|
||||
|
||||
# Add dummy strokes for SVG orphans
|
||||
for i in range(len(desc), len(svg)):
|
||||
s = Stroke(kanji.root)
|
||||
s.stype = "Missing stroke"
|
||||
s.svg = svg[i]
|
||||
kanji.root.childs.append(s)
|
||||
kanjis.append(kanji)
|
||||
|
||||
if len(desc) != len(svg): dst = "SVGMismatch"
|
||||
else: dst = "SVG"
|
||||
out = codecs.open("output/%s/%s.svg" % (dst, kanji.kId()), "w", "utf-8")
|
||||
createSVG(out, kanji)
|
||||
|
||||
|
||||
# Now parse orphan SVGs (probably just kana and romaji)
|
||||
files = os.listdir("SVG")
|
||||
for f in files:
|
||||
# Let's keep the variations out of the process for now...
|
||||
if '-' in f: continue
|
||||
if not f.endswith(".svg"): continue
|
||||
|
||||
if realchr(int(f[:-4], 16)) in handled: continue
|
||||
kId = f[:-4]
|
||||
if "-" in kId: code, variant = kId.split("-")
|
||||
else: code, variant = kId, None
|
||||
|
||||
if f[:-4] in handled: continue
|
||||
parser = xml.sax.make_parser()
|
||||
svgHandler = KanjiStrokeHandler()
|
||||
parser.setContentHandler(svgHandler)
|
||||
@@ -129,48 +142,10 @@ if __name__ == "__main__":
|
||||
parser.setFeature(xml.sax.handler.feature_external_pes, False)
|
||||
parser.parse(os.path.join("SVG", f))
|
||||
|
||||
kanji = Kanji(f[:-4])
|
||||
kanji.midashi = unichr(int(f[:-4], 16))
|
||||
kanji = Kanji(int(code, 16), variant)
|
||||
kanji.root = StrokeGr(None)
|
||||
for s in svgHandler.strokes:
|
||||
stroke = Stroke(kanji.root)
|
||||
stroke.svg = s
|
||||
kanji.root.childs.append(stroke)
|
||||
kanjis.append(kanji)
|
||||
|
||||
# Stroke count mismatch kanji
|
||||
mismatch.sort()
|
||||
misout = codecs.open("Main.StrokeCountMismatch", "w", "utf-8")
|
||||
misout.write('version=pmwiki-2.1.0 urlencoded=1\ntext=')
|
||||
misout.write("'''This page is generated - please do not edit it!'''%0a%0aThe following kanji have a stroke order mismatch between their XML and SVG descriptions:%0a")
|
||||
for i in range(len(mismatch)):
|
||||
misout.write("* %s: XML %d, SVG %d" % (mismatch[i][0], mismatch[i][1], mismatch[i][2]))
|
||||
misout.write("%0a")
|
||||
|
||||
# Missing components
|
||||
misout = codecs.open("Main.MissingKanji", "w", "utf-8")
|
||||
misout.write('version=pmwiki-2.1.0 urlencoded=1\ntext=')
|
||||
misout.write("'''This page is generated - please do not edit it!'''%0a%0aThe following kanji are referenced as components but no data is available for them:%0a")
|
||||
for k in metComponents.difference(handled):
|
||||
misout.write("* %s" % (k,))
|
||||
misout.write("%0a")
|
||||
|
||||
# Finally write the output files
|
||||
os.mkdir("data")
|
||||
for kanji in kanjis:
|
||||
out = codecs.open("data/" + str(kanji.id) + ".svg", "w", "utf-8")
|
||||
createSVG(out, kanji)
|
||||
|
||||
# Finally write the output file
|
||||
#curDate = str(datetime.date.today())
|
||||
#kanjis.sort(lambda x,y: cmp(x.id, y.id))
|
||||
#out = codecs.open("kanjivg-%s.xml" % (curDate.replace("-", ""),), "w", "utf-8")
|
||||
#out.write('<?xml version="1.0" encoding="UTF-8"?>\n')
|
||||
#out.write("<!-- ")
|
||||
#out.write(licenseString)
|
||||
#out.write("\nThis file has been generated on %s, using the latest KanjiVG data to this date." % (curDate))
|
||||
#out.write("\n-->\n\n")
|
||||
#out.write("<kanjis>\n");
|
||||
#for kanji in kanjis:
|
||||
#kanji.toXML(out)
|
||||
#out.write("</kanjis>\n");
|
||||
# TODO merge with upper part - kana and romaji should not be considered mismatched
|
||||
|
||||
@@ -1,13 +1,10 @@
|
||||
#!/bin/sh
|
||||
rm -Rf kanjivg.xml.gz generated
|
||||
rm -Rf kanjivg.xml.gz data
|
||||
./mergexml.py
|
||||
outFile="kanjivg-`date +\"%Y%m%d\"`.xml"
|
||||
mkdir -p currentdata/SVG
|
||||
./createsvgfiles.py $outFile
|
||||
tar czf currentdata.tar.gz currentdata
|
||||
outFile="kanjivg-`date +\"%Y%m%d\"`.tar.gz"
|
||||
tar cvzf $outFile data
|
||||
gzip $outFile
|
||||
scp $outFile.gz gnurou@gnurou.org:/srv/http/kanjivg/upload/Main/
|
||||
scp currentdata.tar.gz gnurou@gnurou.org:/home/gnurou
|
||||
scp Main.StrokeCountMismatch Main.MissingKanji gnurou@gnurou.org:/srv/http/kanjivg/wiki.d
|
||||
ssh gnurou@gnurou.org "cd /srv/http/kanjivg ; rm -Rf currentdata ; tar xfz /home/gnurou/currentdata.tar.gz ; rm /home/gnurou/currentdata.tar.gz ; cd upload/Main ; ln -sf $outFile.gz kanjivg-latest.xml.gz"
|
||||
#ssh gnurou@gnurou.org "cd /srv/http/kanjivg ; rm -Rf currentdata ; tar xfz /home/gnurou/currentdata.tar.gz ; rm /home/gnurou/currentdata.tar.gz ; cd upload/Main ; ln -sf $outFile.gz kanjivg-latest.xml.gz"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user