mirror of
https://github.com/KanjiVG/kanjivg.git
synced 2026-03-11 19:03:06 +01:00
Updated Python scripts to work with Python 2 and 3; fixed XML parsers
This commit is contained in:
173
kanjivg.py
173
kanjivg.py
@@ -16,6 +16,11 @@
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from xmlhandler import *
|
||||
from utils import PYTHON_VERSION_MAJOR, canonicalId
|
||||
|
||||
if PYTHON_VERSION_MAJOR > 2:
|
||||
def unicode(s):
|
||||
return s
|
||||
|
||||
# Sample licence header
|
||||
licenseString = """Copyright (C) 2009-2013 Ulrich Apel.
|
||||
@@ -56,16 +61,19 @@ def realchr(i):
|
||||
|
||||
class Kanji:
|
||||
"""Describes a kanji. The root stroke group is accessible from the strokes member."""
|
||||
def __init__(self, code, variant):
|
||||
# Unicode of char being represented (int)
|
||||
self.code = code
|
||||
def __init__(self, code, variant = None):
|
||||
# Unicode of char being represented (standard str)
|
||||
self.code = canonicalId(code)
|
||||
# Variant of the character, if any
|
||||
self.variant = variant
|
||||
self.strokes = None
|
||||
|
||||
def __repr__(self):
|
||||
return repr(vars(self))
|
||||
|
||||
# String identifier used to uniquely identify the kanji
|
||||
def kId(self):
|
||||
ret = "%05x" % (self.code,)
|
||||
ret = self.code
|
||||
if self.variant: ret += "-%s" % (self.variant,)
|
||||
return ret
|
||||
|
||||
@@ -88,7 +96,7 @@ class Kanji:
|
||||
|
||||
class StrokeGr:
|
||||
"""Describes a stroke group belonging to a kanji as closely as possible to the XML format. Sub-stroke groups or strokes are available in the childs member. They can either be of class StrokeGr or Stroke so their type should be checked."""
|
||||
def __init__(self, parent):
|
||||
def __init__(self, parent = None):
|
||||
self.parent = parent
|
||||
if parent: parent.childs.append(self)
|
||||
# Element of strokegr
|
||||
@@ -107,6 +115,15 @@ class StrokeGr:
|
||||
|
||||
self.childs = []
|
||||
|
||||
def __repr__(self):
|
||||
return repr(vars(self))
|
||||
|
||||
def setParent(self, parent):
    """Attach this group to `parent`; this may only be done once.

    Appends this group to ``parent.childs`` and stores `parent` in
    ``self.parent``.

    Raises:
        Exception: if this group already has a parent, or if `parent` is
            None. Nothing removes the group from a previous parent's
            ``childs`` list, so re-parenting is refused.
    """
    if self.parent is not None or parent is None:
        # A bare string is not a valid exception object: raising one fails
        # with a TypeError and the message is lost. Raise a real exception.
        raise Exception("Set parent should only be set once! There is no cleanup for old parents.")
    parent.childs.append(self)
    self.parent = parent
|
||||
|
||||
def toSVG(self, out, rootId, groupCpt = [0], strCpt = [1], indent = 0):
|
||||
gid = rootId
|
||||
if groupCpt[0] != 0: gid += "-g" + str(groupCpt[0])
|
||||
@@ -208,6 +225,9 @@ class Stroke:
|
||||
self.stype = None
|
||||
self.svg = None
|
||||
self.numberPos = None
|
||||
|
||||
def __repr__(self):
|
||||
return repr(vars(self))
|
||||
|
||||
def numberToSVG(self, out, number, indent = 0):
|
||||
if self.numberPos:
|
||||
@@ -224,40 +244,58 @@ class Stroke:
|
||||
|
||||
class KanjisHandler(BasicHandler):
|
||||
"""XML handler for parsing kanji files. It can handle single-kanji files or aggregation files. After parsing, the kanjis are accessible through the kanjis member, indexed by their svg file name."""
|
||||
def __init__(self, code, variant):
|
||||
def __init__(self):
|
||||
BasicHandler.__init__(self)
|
||||
self.kanji = Kanji(code, variant)
|
||||
self.kanji = None
|
||||
self.kanjis = {}
|
||||
self.group = None
|
||||
self.groups = []
|
||||
self.compCpt = {}
|
||||
self.metComponents = set()
|
||||
|
||||
def handle_start_kanji(self, attrs):
|
||||
pass
|
||||
if self.kanji is not None:
|
||||
raise Exception("Kanji cannot be nested")
|
||||
if self.group is not None:
|
||||
raise Exception("Kanji cannot be inside a group")
|
||||
if len(self.groups) != 0:
|
||||
raise Exception("Previous kanji not closed correctly")
|
||||
idType, idVariantStr = str(attrs["id"]).split("_")
|
||||
if idType != "kvg:kanji":
|
||||
raise Exception("Each kanji should have id formatted as kvg:kanji_XXXXX.")
|
||||
idVariant = idVariantStr.split('-')
|
||||
self.kanji = Kanji(*idVariant)
|
||||
|
||||
|
||||
def handle_end_kanji(self):
|
||||
if len(self.groups) != 0:
|
||||
print("WARNING: stroke groups remaining after reading kanji!")
|
||||
if self.group is not None:
|
||||
raise Exception("A group is not closed inside the kanji.")
|
||||
if len(self.groups) != 1:
|
||||
raise Exception("Kanji should have 1 root group.")
|
||||
self.kanji.strokes = self.groups[0]
|
||||
self.kanjis[self.kanji.code] = self.kanji
|
||||
self.groups = []
|
||||
self.kanji = None
|
||||
|
||||
def handle_start_strokegr(self, attrs):
|
||||
if len(self.groups) == 0: parent = None
|
||||
else: parent = self.groups[-1]
|
||||
group = StrokeGr(parent)
|
||||
def handle_start_g(self, attrs):
|
||||
if self.kanji is None:
|
||||
raise Exception("Stroke group must be inside a kanji")
|
||||
group = StrokeGr(self.group)
|
||||
|
||||
# Now parse group attributes
|
||||
if attrs.has_key("element"): group.element = unicode(attrs["element"])
|
||||
if attrs.has_key("variant"): group.variant = str(attrs["variant"])
|
||||
if attrs.has_key("partial"): group.partial = str(attrs["partial"])
|
||||
if attrs.has_key("original"): group.original = unicode(attrs["original"])
|
||||
if attrs.has_key("part"): group.part = int(attrs["part"])
|
||||
if attrs.has_key("number"): group.number = int(attrs["number"])
|
||||
if attrs.has_key("tradForm") and str(attrs["tradForm"]) == "true": group.tradForm = True
|
||||
if attrs.has_key("radicalForm") and str(attrs["radicalForm"]) == "true": group.radicalForm = True
|
||||
if attrs.has_key("position"): group.position = unicode(attrs["position"])
|
||||
if attrs.has_key("radical"): group.radical = unicode(attrs["radical"])
|
||||
if attrs.has_key("phon"): group.phon = unicode(attrs["phon"])
|
||||
if "kvg:element" in attrs: group.element = unicode(attrs["kvg:element"])
|
||||
if "kvg:variant" in attrs: group.variant = str(attrs["kvg:variant"])
|
||||
if "kvg:partial" in attrs: group.partial = str(attrs["kvg:partial"])
|
||||
if "kvg:original" in attrs: group.original = unicode(attrs["kvg:original"])
|
||||
if "kvg:part" in attrs: group.part = int(attrs["kvg:part"])
|
||||
if "kvg:number" in attrs: group.number = int(attrs["kvg:number"])
|
||||
if "kvg:tradForm" in attrs and str(attrs["kvg:tradForm"]) == "true": group.tradForm = True
|
||||
if "kvg:radicalForm" in attrs and str(attrs["kvg:radicalForm"]) == "true": group.radicalForm = True
|
||||
if "kvg:position" in attrs: group.position = unicode(attrs["kvg:position"])
|
||||
if "kvg:radical" in attrs: group.radical = unicode(attrs["kvg:radical"])
|
||||
if "kvg:phon" in attrs: group.phon = unicode(attrs["kvg:phon"])
|
||||
|
||||
self.groups.append(group)
|
||||
self.group = group
|
||||
|
||||
if group.element: self.metComponents.add(group.element)
|
||||
if group.original: self.metComponents.add(group.original)
|
||||
@@ -266,13 +304,13 @@ class KanjisHandler(BasicHandler):
|
||||
if not group.part: print("%s: Number specified, but part missing" % (self.kanji.kId()))
|
||||
# The group must exist already
|
||||
if group.part > 1:
|
||||
if not self.compCpt.has_key(group.element + str(group.number)):
|
||||
if (group.element + str(group.number)) not in self.compCpt:
|
||||
print("%s: Missing numbered group" % (self.kanji.kId()))
|
||||
elif self.compCpt[group.element + str(group.number)] != group.part - 1:
|
||||
print("%s: Incorrectly numbered group" % (self.kanji.kId()))
|
||||
# The group must not exist
|
||||
else:
|
||||
if self.compCpt.has_key(group.element + str(group.number)):
|
||||
if (group.element + str(group.number)) in self.compCpt:
|
||||
print("%s: Duplicate numbered group" % (self.kanji.kId()))
|
||||
self.compCpt[group.element + str(group.number)] = group.part
|
||||
# No number, just a part - groups restart with part 1, otherwise must
|
||||
@@ -280,26 +318,25 @@ class KanjisHandler(BasicHandler):
|
||||
elif group.part:
|
||||
# The group must exist already
|
||||
if group.part > 1:
|
||||
if not self.compCpt.has_key(group.element):
|
||||
if group.element not in self.compCpt:
|
||||
print("%s: Incorrectly started multi-part group" % (self.kanji.kId()))
|
||||
elif self.compCpt[group.element] != group.part - 1:
|
||||
print("%s: Incorrectly splitted multi-part group" % (self.kanji.kId()))
|
||||
self.compCpt[group.element] = group.part
|
||||
|
||||
def handle_end_strokegr(self):
|
||||
group = self.groups.pop()
|
||||
if len(self.groups) == 0:
|
||||
if self.kanji.strokes:
|
||||
print("WARNING: overwriting root of kanji!")
|
||||
self.kanji.strokes = group
|
||||
def handle_end_g(self):
|
||||
if self.group.parent is None:
|
||||
self.groups.append(self.group)
|
||||
self.group = self.group.parent
|
||||
|
||||
def handle_start_stroke(self, attrs):
|
||||
if len(self.groups) == 0: parent = None
|
||||
else: parent = self.groups[-1]
|
||||
stroke = Stroke(parent)
|
||||
stroke.stype = unicode(attrs["type"])
|
||||
if attrs.has_key("path"): stroke.svg = unicode(attrs["path"])
|
||||
self.groups[-1].childs.append(stroke)
|
||||
def handle_start_path(self, attrs):
|
||||
if self.kanji is None or self.group is None:
|
||||
raise Exception("Stroke must be inside a kanji and group!")
|
||||
stroke = Stroke(self.group)
|
||||
if "kvg:type" in attrs:
|
||||
stroke.stype = unicode(attrs["kvg:type"])
|
||||
if "d" in attrs: stroke.svg = unicode(attrs["d"])
|
||||
self.group.childs.append(stroke)
|
||||
|
||||
class SVGHandler(BasicHandler):
|
||||
"""SVG handler for parsing final kanji files. It can handle single-kanji files or aggregation files. After parsing, the kanji are accessible through the kanjis member, indexed by their svg file name."""
|
||||
@@ -311,28 +348,36 @@ class SVGHandler(BasicHandler):
|
||||
self.metComponents = set()
|
||||
|
||||
def handle_start_g(self, attrs):
|
||||
group = StrokeGr()
|
||||
|
||||
# Special case for handling the root
|
||||
if len(self.groups) == 0:
|
||||
id = hex(realord(attrs["kvg:element"]))[2:]
|
||||
self.currentKanji = Kanji(id)
|
||||
self.kanjis[id] = self.currentKanji
|
||||
idType, idVariantStr = str(attrs["id"]).split("_")
|
||||
idVariant = idVariantStr.split('-')
|
||||
if idType == "kvg:StrokePaths":
|
||||
pass
|
||||
elif idType == "kvg:StrokeNumbers":
|
||||
return
|
||||
else:
|
||||
raise Exception("Invalid root group id type (%s)" % (str(attrs["id"]),))
|
||||
self.currentKanji = Kanji(*idVariant)
|
||||
self.kanjis[self.currentKanji.code] = self.currentKanji
|
||||
self.compCpt = {}
|
||||
parent = None
|
||||
else: parent = self.groups[-1]
|
||||
else:
|
||||
group.setParent(self.groups[-1])
|
||||
|
||||
group = StrokeGr(parent)
|
||||
# Now parse group attributes
|
||||
if attrs.has_key("kvg:element"): group.element = unicode(attrs["kvg:element"])
|
||||
if attrs.has_key("kvg:variant"): group.variant = str(attrs["kvg:variant"])
|
||||
if attrs.has_key("kvg:partial"): group.partial = str(attrs["kvg:partial"])
|
||||
if attrs.has_key("kvg:original"): group.original = unicode(attrs["kvg:original"])
|
||||
if attrs.has_key("kvg:part"): group.part = int(attrs["kvg:part"])
|
||||
if attrs.has_key("kvg:number"): group.number = int(attrs["kvg:number"])
|
||||
if attrs.has_key("kvg:tradForm") and str(attrs["kvg:tradForm"]) == "true": group.tradForm = True
|
||||
if attrs.has_key("kvg:radicalForm") and str(attrs["kvg:radicalForm"]) == "true": group.radicalForm = True
|
||||
if attrs.has_key("kvg:position"): group.position = unicode(attrs["kvg:position"])
|
||||
if attrs.has_key("kvg:radical"): group.radical = unicode(attrs["kvg:radical"])
|
||||
if attrs.has_key("kvg:phon"): group.phon = unicode(attrs["kvg:phon"])
|
||||
if "kvg:element" in attrs: group.element = unicode(attrs["kvg:element"])
|
||||
if "kvg:variant" in attrs: group.variant = str(attrs["kvg:variant"])
|
||||
if "kvg:partial" in attrs: group.partial = str(attrs["kvg:partial"])
|
||||
if "kvg:original" in attrs: group.original = unicode(attrs["kvg:original"])
|
||||
if "kvg:part" in attrs: group.part = int(attrs["kvg:part"])
|
||||
if "kvg:number" in attrs: group.number = int(attrs["kvg:number"])
|
||||
if "kvg:tradForm" in attrs and str(attrs["kvg:tradForm"]) == "true": group.tradForm = True
|
||||
if "kvg:radicalForm" in attrs and str(attrs["kvg:radicalForm"]) == "true": group.radicalForm = True
|
||||
if "kvg:position" in attrs: group.position = unicode(attrs["kvg:position"])
|
||||
if "kvg:radical" in attrs: group.radical = unicode(attrs["kvg:radical"])
|
||||
if "kvg:phon" in attrs: group.phon = unicode(attrs["kvg:phon"])
|
||||
|
||||
self.groups.append(group)
|
||||
|
||||
@@ -364,9 +409,11 @@ class SVGHandler(BasicHandler):
|
||||
self.compCpt[group.element] = group.part
|
||||
|
||||
def handle_end_g(self):
|
||||
if len(self.groups) == 0:
|
||||
return
|
||||
group = self.groups.pop()
|
||||
# End of kanji?
|
||||
if len(self.groups) == 0:
|
||||
if len(self.groups) == 1: # index 1 - ignore root group
|
||||
self.currentKanji.strokes = group
|
||||
self.currentKanji = None
|
||||
self.groups = []
|
||||
@@ -376,6 +423,8 @@ class SVGHandler(BasicHandler):
|
||||
if len(self.groups) == 0: parent = None
|
||||
else: parent = self.groups[-1]
|
||||
stroke = Stroke(parent)
|
||||
stroke.stype = unicode(attrs["kvg:type"])
|
||||
if attrs.has_key("d"): stroke.svg = unicode(attrs["d"])
|
||||
if "kvg:type" in attrs:
|
||||
stroke.stype = unicode(attrs["kvg:type"])
|
||||
if "d" in attrs:
|
||||
stroke.svg = unicode(attrs["d"])
|
||||
self.groups[-1].childs.append(stroke)
|
||||
|
||||
24
kvg.py
24
kvg.py
@@ -16,8 +16,9 @@
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import os, os.path, sys, codecs, re, datetime
|
||||
import sys, os, re, datetime
|
||||
from kanjivg import licenseString
|
||||
from utils import open
|
||||
|
||||
pathre = re.compile(r'<path .*d="([^"]*)".*/>')
|
||||
|
||||
@@ -28,9 +29,9 @@ Recognized commands:
|
||||
release create single release file""" % (sys.argv[0],)
|
||||
|
||||
def createPathsSVG(f):
|
||||
s = codecs.open(f, "r", "utf-8").read()
|
||||
s = open(f, "r", encoding="utf-8").read()
|
||||
paths = pathre.findall(s)
|
||||
out = codecs.open(f[:-4] + "-paths.svg", "w", "utf-8")
|
||||
out = open(f[:-4] + "-paths.svg", "w", encoding="utf-8")
|
||||
out.write("""<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN" "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd" []>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="109" height="109" viewBox="0 0 109 109" style="fill:none;stroke:#000000;stroke-width:3;stroke-linecap:round;stroke-linejoin:round;">\n""")
|
||||
i = 1
|
||||
@@ -44,20 +45,21 @@ def mergePathsSVG(f):
|
||||
if not os.path.exists(pFile):
|
||||
print("%s does not exist!" % (pFile,))
|
||||
return
|
||||
s = codecs.open(pFile, "r", "utf-8").read()
|
||||
s = open(pFile, "r", encoding="utf-8").read()
|
||||
paths = pathre.findall(s)
|
||||
s = codecs.open(f, "r", "utf-8").read()
|
||||
s = open(f, "r", encoding="utf-8").read()
|
||||
pos = 0
|
||||
while True:
|
||||
match = pathre.search(s[pos:])
|
||||
if match and len(paths) == 0 or not match and len(paths) > 0:
|
||||
print("Paths count mismatch for %s" % (f,))
|
||||
return
|
||||
if not match and len(paths) == 0: break
|
||||
if not match and len(paths) == 0:
|
||||
break
|
||||
s = s[:pos + match.start(1)] + paths[0] + s[pos + match.end(1):]
|
||||
pos += match.start(1) + len(paths[0])
|
||||
del paths[0]
|
||||
codecs.open(f, "w", "utf-8").write(s)
|
||||
open(f, "w", encoding="utf-8").write(s)
|
||||
|
||||
def release():
|
||||
datadir = "kanji"
|
||||
@@ -69,7 +71,7 @@ def release():
|
||||
del allfiles
|
||||
files.sort()
|
||||
|
||||
out = open("kanjivg.xml", "w")
|
||||
out = open("kanjivg.xml", "w", encoding='utf8')
|
||||
out.write('<?xml version="1.0" encoding="UTF-8"?>\n')
|
||||
out.write("<!--\n")
|
||||
out.write(licenseString)
|
||||
@@ -77,7 +79,8 @@ def release():
|
||||
out.write("\n-->\n")
|
||||
out.write("<kanjivg xmlns:kvg='http://kanjivg.tagaini.net'>\n")
|
||||
for f in files:
|
||||
data = open(os.path.join(datadir, f)).read()
|
||||
data = open(os.path.join(datadir, f), encoding='utf8').read()
|
||||
data = data.replace("\r\n", "\n")
|
||||
data = data[data.find("<svg "):]
|
||||
data = data[data.find(idMatchString) + len(idMatchString):]
|
||||
kidend = data.find("\"")
|
||||
@@ -102,7 +105,8 @@ if __name__ == "__main__":
|
||||
action = actions[sys.argv[1]][0]
|
||||
files = sys.argv[2:]
|
||||
|
||||
if len(files) == 0: action()
|
||||
if len(files) == 0:
|
||||
action()
|
||||
else:
|
||||
for f in files:
|
||||
if not os.path.exists(f):
|
||||
|
||||
75
utils.py
Normal file
75
utils.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import sys, os
|
||||
|
||||
# Major version number of the running interpreter.
PYTHON_VERSION_MAJOR = sys.version_info[0]

if PYTHON_VERSION_MAJOR >= 3:
    from io import open

    def unicode(s):
        """Return `s` unchanged; pass-through stand-in for Python 2's unicode()."""
        return s
else:
    # Python 2: take open() from codecs so callers can pass an encoding.
    from codecs import open
|
||||
|
||||
def canonicalId(id):
    """Return the canonical 5-digit lowercase hex identifier of a character.

    `id` may be:
      - a 1-character string: the character itself, converted with ord();
      - a string of 2 to 5 characters: parsed as a hexadecimal code point;
      - an int: used directly as the code point.

    Raises:
        ValueError: if a string id has any other length or is not valid hex,
            if `id` is neither str nor int, or if the resulting code point
            is not in the range 0x10..0xfffff.
    """
    codepoint = id
    if isinstance(codepoint, str):
        length = len(codepoint)
        if length == 1:
            codepoint = ord(codepoint)
        elif 2 <= length <= 5:
            codepoint = int(codepoint, 16)
        else:
            raise ValueError("Character id must be a 1-character string with the character itself, or 2-5 hex digit unicode codepoint.")
    if not isinstance(codepoint, int):
        raise ValueError("canonicalId: id must be int or str")
    if not (0xf < codepoint <= 0xfffff):
        raise ValueError("Character id out of range")
    return "%05x" % (codepoint,)
|
||||
|
||||
class SvgFileInfo:
    """Describes one SVG data file, based on its file name.

    Attributes:
        path: `dir` joined with `file`.
        id: the part of the file name (without ".svg") before the dash, or
            the whole stem when there is no dash.
        variant: the part after the dash. NOTE: this attribute is only
            assigned when the file name contains a dash.
    """

    def __init__(self, file, dir):
        """Parse `file` (a name inside directory `dir`).

        Raises:
            Exception: if the name does not end in ".svg", has more than one
                dash, or its id part is not in canonical form.
            ValueError: propagated from canonicalId() for an invalid id.
        """
        self.path = os.path.join(dir, file)
        stem, extension = file[:-4], file[-4:]
        if extension.lower() != ".svg":
            raise Exception("File should have .svg exension. (%s)" % (str(self.path)))
        pieces = stem.split('-')
        if len(pieces) > 2:
            raise Exception("File should have at most 2 parts separated by a dash. (%s)" % (str(file)))
        if len(pieces) == 2:
            self.variant = pieces[1]
        self.id = pieces[0]
        if canonicalId(self.id) != self.id:
            raise Exception("File name not in canonical format (%s)" % (str(self.path)))

    def __repr__(self):
        """Show the instance attributes as a dict literal."""
        return repr(self.__dict__)

    def read(self, SVGHandler=None):
        """Parse the file and return the single kanji it contains.

        `SVGHandler` is the handler class to instantiate; when None, the one
        from the kanjivg module is imported and used.

        Raises:
            Exception: if the parsed file does not hold exactly one kanji.
        """
        handlerClass = SVGHandler
        if handlerClass is None:
            # Imported lazily, only when no handler class was supplied.
            from kanjivg import SVGHandler as handlerClass
        saxHandler = handlerClass()
        parseXmlFile(self.path, saxHandler)
        entries = list(saxHandler.kanjis.values())
        if len(entries) == 1:
            return entries[0]
        raise Exception("File does not contain 1 kanji entry. (%s)" % (self.path))
|
||||
|
||||
def parseXmlFile(path, handler):
    """Run the SAX parser over `path`, delivering its events to `handler`."""
    import xml.sax
    xml.sax.parse(path, handler)
|
||||
|
||||
def listSvgFiles(dir):
    """Return one SvgFileInfo per entry of directory `dir`.

    Every entry returned by os.listdir() is passed to SvgFileInfo, so any
    exception raised by its constructor propagates to the caller.
    """
    infos = []
    for entry in os.listdir(dir):
        infos.append(SvgFileInfo(entry, dir))
    return infos
|
||||
|
||||
def readXmlFile(path, KanjisHandler=None):
    """Parse the XML file at `path` and return the handler's `kanjis` mapping.

    `KanjisHandler` is the handler class to instantiate; when None, the one
    from the kanjivg module is imported and used.

    Raises:
        Exception: if parsing produced no kanji entries.
    """
    handlerClass = KanjisHandler
    if handlerClass is None:
        # Imported lazily, only when no handler class was supplied.
        from kanjivg import KanjisHandler as handlerClass
    saxHandler = handlerClass()
    parseXmlFile(path, saxHandler)
    entries = list(saxHandler.kanjis.values())
    if len(entries) == 0:
        raise Exception("File does not contain any kanji entries. (%s)" % (path))
    return saxHandler.kanjis
|
||||
|
||||
Reference in New Issue
Block a user