1
0
mirror of https://github.com/KanjiVG/kanjivg.git synced 2026-04-24 22:36:11 +02:00

First version of new format finalized

This commit is contained in:
Alexandre Courbot
2011-06-05 20:12:54 +09:00
parent b94c6d69cb
commit f57ce59846
4 changed files with 66 additions and 109 deletions
+2 -1
View File
@@ -2,4 +2,5 @@
kanjivg.xml
*.pyc
kanjivg-????????.xml.gz
output
kanjivg
kanjivgMismatch
+22 -89
View File
@@ -55,13 +55,13 @@ def realchr(i):
else: return unichr(((i - 0x10000) >> 10) + 0xD800) + unichr(0xDC00 + (i & 0x3ff))
class Kanji:
"""Describes a kanji. The root stroke group is accessible from the root member."""
"""Describes a kanji. The root stroke group is accessible from the strokes member."""
def __init__(self, code, variant):
# Unicode of char being represented (int)
self.code = code
# Variant of the character, if any
self.variant = variant
self.root = None
self.strokes = None
# String identifier used to uniquely identify the kanji
def kId(self):
@@ -69,14 +69,21 @@ class Kanji:
if self.variant: ret += "-%s" % (self.variant,)
return ret
def toSVG(self, out, indent = 0):
self.root.toSVG(out, self.kId(), [0], [1])
def outputStrokesNumbers(self, out, indent = 0):
    """Write the number label of every stroke of this kanji to out.

    Strokes are taken in the order returned by getStrokes() and are
    numbered starting from 1. Each label is produced by the stroke's own
    numberToSVG(), called with an indentation of indent + 1.
    """
    for number, stroke in enumerate(self.getStrokes(), 1):
        stroke.numberToSVG(out, number, indent + 1)
def outputStrokes(self, out, indent = 0):
    """Write the stroke paths of this kanji to out.

    Delegates to toSVG() of the object held in self.strokes, passing
    kId() as the root id together with fresh [0] and [1] lists
    (presumably the mutable group and stroke counters, as in
    Stroke.toSVG -- confirm against StrokeGr.toSVG).
    The indent argument is accepted but not used.
    """
    rootGroup = self.strokes
    groupCounter = [0]
    strokeCounter = [1]
    rootGroup.toSVG(out, self.kId(), groupCounter, strokeCounter)
def simplify(self):
self.root.simplify()
self.strokes.simplify()
def getStrokes(self):
return self.root.getStrokes()
return self.strokes.getStrokes()
class StrokeGr:
@@ -200,6 +207,11 @@ class Stroke:
def __init__(self, parent):
self.stype = None
self.svg = None
self.numberPos = None
def numberToSVG(self, out, number, indent = 0):
    """Write this stroke's number label to out as an SVG <text> element.

    Nothing is written when numberPos is unset. Otherwise numberPos[0]
    and numberPos[1] become the last two values of the transform matrix,
    formatted with two decimals, and the line is prefixed with indent
    tab characters.
    """
    if not self.numberPos:
        return
    x, y = self.numberPos[0], self.numberPos[1]
    element = '<text transform="matrix(1 0 0 1 %.2f %.2f)">%d</text>\n' % (x, y, number)
    out.write("\t" * indent + element)
def toSVG(self, out, rootId, groupCpt, strCpt, indent = 0):
pid = rootId + "-s" + str(strCpt[0])
@@ -207,85 +219,6 @@ class Stroke:
if not self.svg: out.write("\t" * indent + '<path id="%s" d="" kanjivg:type="%s"/>\n' % (pid, self.stype))
else: out.write("\t" * indent + '<path id="%s" kanjivg:type="%s" d="%s"/>\n' % (pid, self.stype, self.svg))
class StructuredKanji:
"""A more structured format for the kanji, where all the parts of groups are grouped together."""
def __init__(self, kanji):
self.components = []
self.strokes = []
stk = []
self.__buildStructure(kanji.root, stk, None)
def __mostCommonAncestor(self, np, npp):
# Update the parent to the most common parent of all parts
npSave = np
if np != None:
while np != npp:
np = np.parent
if np == None:
npp = npp.parent
np = npSave
return np
def __buildStructure(self, group, stk, parent):
# Find the component if it exists already, or create it as needed
# Number exists and part is > 1, we must find a component which number matches.
newParent = None
if group.number > 0 and group.part > 1:
for component in self.components:
if component.element == group.element and component.number == group.number:
newParent = component
component.parent = self.__mostCommonAncestor(component.parent, parent)
break
# Should never happen
if not newParent: raise Exception("Unable to find component!")
# No number but a part, we need the latest component which element matches
elif group.part > 1:
for component in self.components:
if component.element == group.element:
newParent = component
component.parent = self.__mostCommonAncestor(component.parent, parent)
break
if not newParent: raise Exception("Unable to find component!")
# Either a single part component or a first part - we need to create the component
else:
# Only do that if the current group has an element
if group.element:
newParent = StructuredStrokeGroup(parent, group.element, group.original, group.number)
self.components.append(newParent)
# Else keep the same parent
else: newParent = parent
if newParent != parent: stk.append(newParent)
# Add the found group as a child of its parent
if parent: parent.childs.append(newParent)
# Now parse the childs of the group
for child in group.childs:
# Another group - we need to call ourselves recursively to build it
if isinstance(child, StrokeGr):
self.__buildStructure(child, stk, newParent)
# A stroke - just add it to our list as well as
# to the list of all the parents on the stack
elif isinstance(child, Stroke):
self.strokes.append(child)
for pGroup in stk: pGroup.strokes.append(child)
# Set the direct parent of the child
child.parent = newParent
if newParent != parent: stk.pop()
class StructuredStrokeGroup:
def __init__(self, parent, element, original, number):
self.parent = parent
self.element = element
self.original = original
self.number = number
self.childs = []
self.strokes = []
class KanjisHandler(BasicHandler):
"""XML handler for parsing kanji files. It can handle single-kanji files or aggregation files. After parsing, the kanjis are accessible through the kanjis member, indexed by their svg file name."""
def __init__(self, code, variant):
@@ -353,9 +286,9 @@ class KanjisHandler(BasicHandler):
def handle_end_strokegr(self):
group = self.groups.pop()
if len(self.groups) == 0:
if self.kanji.root:
if self.kanji.strokes:
print "WARNING: overwriting root of kanji!"
self.kanji.root = group
self.kanji.strokes = group
def handle_start_stroke(self, attrs):
if len(self.groups) == 0: parent = None
@@ -366,7 +299,7 @@ class KanjisHandler(BasicHandler):
self.groups[-1].childs.append(stroke)
class SVGHandler(BasicHandler):
"""SVG handler for parsing final kanji files. It can handle single-kanji files or aggregation files. After parsing, the kanjis are accessible through the kanjis member, indexed by their svg file name."""
"""SVG handler for parsing final kanji files. It can handle single-kanji files or aggregation files. After parsing, the kanji are accessible through the kanjis member, indexed by their svg file name."""
def __init__(self):
BasicHandler.__init__(self)
self.kanjis = {}
@@ -431,7 +364,7 @@ class SVGHandler(BasicHandler):
group = self.groups.pop()
# End of kanji?
if len(self.groups) == 0:
self.currentKanji.root = group
self.currentKanji.strokes = group
self.currentKanji = None
self.groups = []
+41 -18
View File
@@ -43,7 +43,7 @@ kanjivg:phon CDATA #IMPLIED >
xmlns:kanjivg CDATA #FIXED "http://kanjivg.tagaini.net"
kanjivg:type CDATA #IMPLIED >
]>
<svg xmlns="http://www.w3.org/2000/svg" width="109" height="109" viewBox="0 0 109 109" style="fill:none;stroke:#000000;stroke-width:3;stroke-linecap:round;stroke-linejoin:round;">
<svg xmlns="http://www.w3.org/2000/svg" width="109" height="109" viewBox="0 0 109 109">
""")
#<defs>
#<marker id="Triangle"
@@ -54,30 +54,48 @@ kanjivg:type CDATA #IMPLIED >
#<path d="M 0 0 L 10 5 L 0 10 z" />
#</marker>
#</defs>
kanji.toSVG(out)
out.write("""<g id="StrokePaths" style="fill:none;stroke:#000000;stroke-width:3;stroke-linecap:round;stroke-linejoin:round;">\n""")
kanji.outputStrokes(out)
out.write("</g>\n");
out.write("""<g id="StrokeNumbers" style="font-size:8;fill:#808080">\n""")
kanji.outputStrokesNumbers(out)
out.write("</g>\n")
out.write("</svg>\n")
# Basic handler to extract the information we need from former SVG files
class KanjiStrokeHandler(BasicHandler):
def __init__(self):
BasicHandler.__init__(self)
self.strokes = []
self.active = False
self.strokesNumbers = []
# 0 -> do nothing, 1 -> parse numbers, 2 -> parse strokes
self.step = 0
# Extract position of number
def handle_start_text(self, attrs):
    """Record the position of a stroke number found on a <text> element.

    Does nothing unless self.step is 1 (the value handle_start_g sets for
    the "StrokeNumbers" group). Elements without a "transform" attribute,
    or whose transform is not of the form "matrix(a b c d x y)", are
    ignored. Otherwise the last two matrix values are appended to
    self.strokesNumbers as an (x, y) tuple of floats.
    """
    if self.step != 1: return
    # "in" replaces has_key(), which plain dicts no longer provide
    if "transform" not in attrs: return
    # Raw string: same pattern as before, without relying on "\(" being
    # passed through as an unknown escape sequence
    match = re.match(r"matrix\(.+ .+ .+ .+ (.+) (.+)\)", attrs["transform"])
    if not match: return
    self.strokesNumbers.append((float(match.group(1)), float(match.group(2))))
def handle_start_path(self, attrs):
if self.step != 2: return
strokeData = attrs["d"]
# Replace spaces between digits by the comma separator
strokeData = re.sub('(\d) (\d)', '\\1,\\2', strokeData)
strokeData = re.sub("[\n\t ]+", "", strokeData)
self.strokes.append(strokeData)
def handle_start_g(self, attrs):
if attrs.has_key("id") and attrs["id"] == "Vektorbild": self.active = True
if attrs.has_key("id"):
if attrs["id"] == "StrokeNumbers": self.step = 1
elif attrs["id"] == "StrokePaths": self.step = 2
if __name__ == "__main__":
os.mkdir("output")
os.mkdir("output/SVG")
os.mkdir("output/SVGMismatch")
os.mkdir("kanjivg")
os.mkdir("kanjivgMismatch")
files = os.listdir("XML")
handled = set()
metComponents = set()
@@ -108,23 +126,27 @@ if __name__ == "__main__":
kanji = descHandler.kanji
desc = kanji.getStrokes()
svg = svgHandler.strokes
numbers = svgHandler.strokesNumbers
if len(svg) != len(numbers):
print "Warning: kanji %s has %d strokes but %d numbers!" % (kId, len(svg), len(numbers))
# Copy SVG into kanji desc
for i in range(min(len(desc), len(svg))):
desc[i].svg = svg[i]
if i < len(numbers): desc[i].numberPos = numbers[i]
# Add dummy strokes for SVG orphans
for i in range(len(desc), len(svg)):
s = Stroke(kanji.root)
s = Stroke(kanji.strokes)
s.stype = "Missing stroke"
s.svg = svg[i]
kanji.root.childs.append(s)
kanji.strokes.childs.append(s)
if len(desc) != len(svg): dst = "SVGMismatch"
else: dst = "SVG"
out = codecs.open("output/%s/%s.svg" % (dst, kanji.kId()), "w", "utf-8")
if len(desc) != len(svg): dst = "kanjivgMismatch"
else: dst = "kanjivg"
out = codecs.open("%s/%s.svg" % (dst, kanji.kId()), "w", "utf-8")
createSVG(out, kanji)
# Now parse orphan SVGs (probably just kana and romaji)
files = os.listdir("SVG")
for f in files:
@@ -143,9 +165,10 @@ if __name__ == "__main__":
parser.parse(os.path.join("SVG", f))
kanji = Kanji(int(code, 16), variant)
kanji.root = StrokeGr(None)
kanji.strokes = StrokeGr(None)
for s in svgHandler.strokes:
stroke = Stroke(kanji.root)
stroke = Stroke(kanji.strokes)
stroke.svg = s
kanji.root.childs.append(stroke)
# TODO merge with upper part - kana and romaji should not be considered mismatched
kanji.strokes.childs.append(stroke)
out = codecs.open("kanjivg/%s.svg" % (kanji.kId(),), "w", "utf-8")
createSVG(out, kanji)
Regular → Executable
+1 -1
View File
@@ -22,7 +22,7 @@
import os, datetime, re
from kanjivg import licenseString
__datadir = "output/SVG"
__datadir = "kanjivg"
if __name__ == "__main__":
allfiles = os.listdir(__datadir)