diff --git a/kanjivg.py b/kanjivg.py
index 255b7a2d8..52644658a 100644
--- a/kanjivg.py
+++ b/kanjivg.py
@@ -18,7 +18,7 @@
from xmlhandler import *
# Sample licence header
-licenseString = """Copyright (C) 2009/2010 Ulrich Apel.
+licenseString = """Copyright (C) 2009/2010/2011 Ulrich Apel.
This work is distributed under the conditions of the Creative Commons
Attribution-Share Alike 3.0 Licence. This means you are free:
* to Share - to copy, distribute and transmit the work
@@ -56,32 +56,35 @@ def realchr(i):
class Kanji:
"""Describes a kanji. The root stroke group is accessible from the root member."""
- def __init__(self, id):
- self.id = id
- self.midashi = None
+ def __init__(self, code, variant):
+ # Unicode code point of the character being represented (int)
+ self.code = code
+ # Variant name of the character; None (or empty) when this is not a variant
+ self.variant = variant
self.root = None
- def toSVG(self, out, indent = 0):
- self.root.toSVG(out, self.id, [0], [1])
+ # Unique string identifier of the kanji: code point as zero-padded (5 digits) lowercase hex, followed by "-<variant>" when a variant is set
+ def kId(self):
+ ret = "%05x" % (self.code,)
+ if self.variant: ret += "-%s" % (self.variant,)
+ return ret
- def toXML(self, out, indent = 0):
- out.write("\t" * indent + '\n' % (self.midashi, self.id))
- self.root.toXML(out, 0)
- out.write("\t" * indent + '\n')
+ def toSVG(self, out, indent = 0):
+ self.root.toSVG(out, self.kId(), [0], [1])
def simplify(self):
self.root.simplify()
def getStrokes(self):
return self.root.getStrokes()
-
+
class StrokeGr:
"""Describes a stroke group belonging to a kanji as closely as possible to the XML format. Sub-stroke groups or strokes are available in the childs member. They can either be of class StrokeGr or Stroke so their type should be checked."""
def __init__(self, parent):
self.parent = parent
if parent: parent.childs.append(self)
- # Element of strokegr, or midashi for kanji
+ # Element of strokegr
self.element = None
# A more common, safer element this one derives of
self.original = None
@@ -284,29 +287,19 @@ class StructuredStrokeGroup:
class KanjisHandler(BasicHandler):
- """XML handler for parsing kanji files. It can handle single-kanji files or aggregation files. After parsing, the kanjis are accessible through the kanjis member, indexed by their svg file name."""
+ """XML handler for parsing a single-kanji file. The kanji is created from the code and variant given to the constructor and, after parsing, is accessible through the kanji member."""
- def __init__(self):
+ def __init__(self, code, variant):
BasicHandler.__init__(self)
- self.kanjis = {}
- self.currentKanji = None
+ self.kanji = Kanji(code, variant)
self.groups = []
+ self.compCpt = {}
self.metComponents = set()
def handle_start_kanji(self, attrs):
- id = str(attrs["id"])
- self.currentKanji = Kanji(id)
- self.currentKanji.midashi = unicode(attrs["midashi"])
- # Check that the ID matches the midashi
- midashiNumber = "%04x" % (realord(self.currentKanji.midashi))
- if midashiNumber != id[:len(midashiNumber)]:
- print "Warning: id does not match midashi (%s(%s) %s)" % (self.currentKanji.midashi, midashiNumber, id)
- self.kanjis[id] = self.currentKanji
-
- self.compCpt = {}
+ pass
def handle_end_kanji(self):
if len(self.groups) != 0:
print "WARNING: stroke groups remaining after reading kanji!"
- self.currentKanji = None
self.groups = []
def handle_start_strokegr(self, attrs):
@@ -333,17 +326,17 @@ class KanjisHandler(BasicHandler):
if group.original: self.metComponents.add(group.original)
if group.number:
- if not group.part: print "%s: Number specified, but part missing" % (self.currentKanji.id)
+ if not group.part: print "%s: Number specified, but part missing" % (self.kanji.kId())
# The group must exist already
if group.part > 1:
if not self.compCpt.has_key(group.element + str(group.number)):
- print "%s: Missing numbered group" % (self.currentKanji.id)
+ print "%s: Missing numbered group" % (self.kanji.kId())
elif self.compCpt[group.element + str(group.number)] != group.part - 1:
- print "%s: Incorrectly numbered group" % (self.currentKanji.id)
+ print "%s: Incorrectly numbered group" % (self.kanji.kId())
# The group must not exist
else:
if self.compCpt.has_key(group.element + str(group.number)):
- print "%s: Duplicate numbered group" % (self.currentKanji.id)
+ print "%s: Duplicate numbered group" % (self.kanji.kId())
self.compCpt[group.element + str(group.number)] = group.part
# No number, just a part - groups restart with part 1, otherwise must
# increase correctly
@@ -351,17 +344,17 @@ class KanjisHandler(BasicHandler):
# The group must exist already
if group.part > 1:
if not self.compCpt.has_key(group.element):
- print "%s: Incorrectly started multi-part group" % (self.currentKanji.id)
+ print "%s: Incorrectly started multi-part group" % (self.kanji.kId())
elif self.compCpt[group.element] != group.part - 1:
- print "%s: Incorrectly splitted multi-part group" % (self.currentKanji.id)
+ print "%s: Incorrectly splitted multi-part group" % (self.kanji.kId())
self.compCpt[group.element] = group.part
def handle_end_strokegr(self):
group = self.groups.pop()
if len(self.groups) == 0:
- if self.currentKanji.root:
+ if self.kanji.root:
print "WARNING: overwriting root of kanji!"
- self.currentKanji.root = group
+ self.kanji.root = group
def handle_start_stroke(self, attrs):
if len(self.groups) == 0: parent = None
@@ -410,17 +403,17 @@ class SVGHandler(BasicHandler):
if group.original: self.metComponents.add(group.original)
if group.number:
- if not group.part: print "%s: Number specified, but part missing" % (self.currentKanji.id)
+ if not group.part: print "%s: Number specified, but part missing" % (self.currentKanji.kId())
# The group must exist already
if group.part > 1:
if not self.compCpt.has_key(group.element + str(group.number)):
- print "%s: Missing numbered group" % (self.currentKanji.id)
+ print "%s: Missing numbered group" % (self.currentKanji.kId())
elif self.compCpt[group.element + str(group.number)] != group.part - 1:
- print "%s: Incorrectly numbered group" % (self.currentKanji.id)
+ print "%s: Incorrectly numbered group" % (self.currentKanji.kId())
# The group must not exist
else:
if self.compCpt.has_key(group.element + str(group.number)):
- print "%s: Duplicate numbered group" % (self.currentKanji.id)
+ print "%s: Duplicate numbered group" % (self.currentKanji.kId())
self.compCpt[group.element + str(group.number)] = group.part
# No number, just a part - groups restart with part 1, otherwise must
# increase correctly
@@ -428,9 +421,9 @@ class SVGHandler(BasicHandler):
# The group must exist already
if group.part > 1:
if not self.compCpt.has_key(group.element):
- print "%s: Incorrectly started multi-part group" % (self.currentKanji.id)
+ print "%s: Incorrectly started multi-part group" % (self.currentKanji.kId())
elif self.compCpt[group.element] != group.part - 1:
- print "%s: Incorrectly splitted multi-part group" % (self.currentKanji.id)
+ print "%s: Incorrectly splitted multi-part group" % (self.currentKanji.kId())
self.compCpt[group.element] = group.part
def handle_end_g(self):
diff --git a/mergexml.py b/mergexml.py
index dfe799a3d..8db123e55 100755
--- a/mergexml.py
+++ b/mergexml.py
@@ -75,53 +75,66 @@ class KanjiStrokeHandler(BasicHandler):
if attrs.has_key("id") and attrs["id"] == "Vektorbild": self.active = True
if __name__ == "__main__":
+ os.mkdir("output")
+ os.mkdir("output/SVG")
+ os.mkdir("output/SVGMismatch")
files = os.listdir("XML")
- kanjis = []
- mismatch = []
handled = set()
metComponents = set()
for f in files:
- # Let's keep the variations out of the process for now...
- if '-' in f: continue
if not f.endswith(".xml"): continue
- descHandler = KanjisHandler()
- xml.sax.parse(os.path.join("XML", f), descHandler)
- handled.add(realchr(int(f[:-4], 16)))
+ kId = f[:-4]
+ if "-" in kId: code, variant = kId.split("-", 1) # split on the first dash only, so a variant that itself contains "-" cannot break the unpacking
+ else: code, variant = kId, None
+ # Parse XML
+ descHandler = KanjisHandler(int(code, 16), variant)
+ xml.sax.parse(os.path.join("XML", f), descHandler)
+ handled.add(kId)
+
+ # Parse SVG
parser = xml.sax.make_parser()
svgHandler = KanjiStrokeHandler()
parser.setContentHandler(svgHandler)
parser.setFeature(xml.sax.handler.feature_external_ges, False)
parser.setFeature(xml.sax.handler.feature_external_pes, False)
- svgFile = os.path.join("SVG", f[:-3] + "svg")
+ svgFile = os.path.join("SVG", kId + ".svg")
if os.path.exists(svgFile):
parser.parse(svgFile)
metComponents = metComponents.union(descHandler.metComponents)
- kanji = descHandler.kanjis.values()[0]
+ kanji = descHandler.kanji
desc = kanji.getStrokes()
svg = svgHandler.strokes
- if len(desc) != len(svg): mismatch.append((descHandler.kanjis.values()[0].root.element, len(desc), len(svg)))
+ # Copy SVG into kanji desc
for i in range(min(len(desc), len(svg))):
desc[i].svg = svg[i]
+
# Add dummy strokes for SVG orphans
for i in range(len(desc), len(svg)):
s = Stroke(kanji.root)
s.stype = "Missing stroke"
s.svg = svg[i]
kanji.root.childs.append(s)
- kanjis.append(kanji)
+
+ if len(desc) != len(svg): dst = "SVGMismatch"
+ else: dst = "SVG"
+ out = codecs.open("output/%s/%s.svg" % (dst, kanji.kId()), "w", "utf-8")
+ createSVG(out, kanji)
+
# Now parse orphan SVGs (probably just kana and romaji)
files = os.listdir("SVG")
for f in files:
- # Let's keep the variations out of the process for now...
- if '-' in f: continue
if not f.endswith(".svg"): continue
- if realchr(int(f[:-4], 16)) in handled: continue
+ kId = f[:-4]
+ if "-" in kId: code, variant = kId.split("-", 1) # split on the first dash only, so a variant that itself contains "-" cannot break the unpacking
+ else: code, variant = kId, None
+
+ if kId in handled: continue
parser = xml.sax.make_parser()
svgHandler = KanjiStrokeHandler()
parser.setContentHandler(svgHandler)
@@ -129,48 +142,14 @@ if __name__ == "__main__":
parser.setFeature(xml.sax.handler.feature_external_pes, False)
parser.parse(os.path.join("SVG", f))
- kanji = Kanji(f[:-4])
- kanji.midashi = unichr(int(f[:-4], 16))
+ kanji = Kanji(int(code, 16), variant)
kanji.root = StrokeGr(None)
for s in svgHandler.strokes:
stroke = Stroke(kanji.root)
stroke.svg = s
kanji.root.childs.append(stroke)
- kanjis.append(kanji)
-
- # Stroke count mismatch kanji
- mismatch.sort()
- misout = codecs.open("Main.StrokeCountMismatch", "w", "utf-8")
- misout.write('version=pmwiki-2.1.0 urlencoded=1\ntext=')
- misout.write("'''This page is generated - please do not edit it!'''%0a%0aThe following kanji have a stroke order mismatch between their XML and SVG descriptions:%0a")
- for i in range(len(mismatch)):
- misout.write("* %s: XML %d, SVG %d" % (mismatch[i][0], mismatch[i][1], mismatch[i][2]))
- misout.write("%0a")
-
- # Missing components
- misout = codecs.open("Main.MissingKanji", "w", "utf-8")
- misout.write('version=pmwiki-2.1.0 urlencoded=1\ntext=')
- misout.write("'''This page is generated - please do not edit it!'''%0a%0aThe following kanji are referenced as components but no data is available for them:%0a")
- for k in metComponents.difference(handled):
- misout.write("* %s" % (k,))
- misout.write("%0a")
-
- # Finally write the output files
- os.mkdir("data")
- for kanji in kanjis:
- out = codecs.open("data/" + str(kanji.id) + ".svg", "w", "utf-8")
- createSVG(out, kanji)
-
- # Finally write the output file
- #curDate = str(datetime.date.today())
- #kanjis.sort(lambda x,y: cmp(x.id, y.id))
- #out = codecs.open("kanjivg-%s.xml" % (curDate.replace("-", ""),), "w", "utf-8")
- #out.write('\n')
- #out.write("\n\n")
- #out.write("\n");
- #for kanji in kanjis:
- #kanji.toXML(out)
- #out.write("\n");
+ # Write the orphan kanji once per SVG file (after the stroke loop above); it has no XML description to mismatch against, so it goes to output/SVG
+ out = codecs.open("output/SVG/%s.svg" % (kanji.kId(),), "w", "utf-8")
+ createSVG(out, kanji)
+ out.close()
+ # TODO merge with upper part - kana and romaji should not be considered mismatched
diff --git a/updatepublic.sh b/updatepublic.sh
index 2850f46a9..356737eeb 100755
--- a/updatepublic.sh
+++ b/updatepublic.sh
@@ -1,13 +1,12 @@
#!/bin/sh
-rm -Rf kanjivg.xml.gz generated
+# mergexml.py creates output/ with os.mkdir and fails if the directory already exists
+rm -Rf kanjivg.xml.gz output
./mergexml.py
-outFile="kanjivg-`date +\"%Y%m%d\"`.xml"
-mkdir -p currentdata/SVG
-./createsvgfiles.py $outFile
-tar czf currentdata.tar.gz currentdata
+outFile="kanjivg-`date +\"%Y%m%d\"`.tar.gz"
+tar cvzf "$outFile" output
-gzip $outFile
-scp $outFile.gz gnurou@gnurou.org:/srv/http/kanjivg/upload/Main/
+# The archive is already gzip-compressed by tar (z flag); gzip would skip a *.gz file and the upload of "$outFile.gz" would then fail
+scp "$outFile" gnurou@gnurou.org:/srv/http/kanjivg/upload/Main/
-scp currentdata.tar.gz gnurou@gnurou.org:/home/gnurou
+# NOTE(review): this patch removes the code in mergexml.py that generated the Main.* wiki pages uploaded below - confirm they are still produced elsewhere
scp Main.StrokeCountMismatch Main.MissingKanji gnurou@gnurou.org:/srv/http/kanjivg/wiki.d
-ssh gnurou@gnurou.org "cd /srv/http/kanjivg ; rm -Rf currentdata ; tar xfz /home/gnurou/currentdata.tar.gz ; rm /home/gnurou/currentdata.tar.gz ; cd upload/Main ; ln -sf $outFile.gz kanjivg-latest.xml.gz"
+#ssh gnurou@gnurou.org "cd /srv/http/kanjivg ; rm -Rf currentdata ; tar xfz /home/gnurou/currentdata.tar.gz ; rm /home/gnurou/currentdata.tar.gz ; cd upload/Main ; ln -sf $outFile.gz kanjivg-latest.xml.gz"