diff --git a/.gitignore b/.gitignore index 48982a695..7213bd555 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ kanjivg.xml *.pyc kanjivg-????????.xml.gz -output +kanjivg +kanjivgMismatch diff --git a/kanjivg.py b/kanjivg.py index 70ba0745e..447ca01f8 100644 --- a/kanjivg.py +++ b/kanjivg.py @@ -55,13 +55,13 @@ def realchr(i): else: return unichr(((i - 0x10000) >> 10) + 0xD800) + unichr(0xDC00 + (i & 0x3ff)) class Kanji: - """Describes a kanji. The root stroke group is accessible from the root member.""" + """Describes a kanji. The root stroke group is accessible from the strokes member.""" def __init__(self, code, variant): # Unicode of char being represented (int) self.code = code # Variant of the character, if any self.variant = variant - self.root = None + self.strokes = None # String identifier used to uniquely identify the kanji def kId(self): @@ -69,14 +69,21 @@ class Kanji: if self.variant: ret += "-%s" % (self.variant,) return ret - def toSVG(self, out, indent = 0): - self.root.toSVG(out, self.kId(), [0], [1]) + def outputStrokesNumbers(self, out, indent = 0): + strokes = self.getStrokes() + cpt = 1 + for stroke in strokes: + stroke.numberToSVG(out, cpt, indent + 1) + cpt += 1 + + def outputStrokes(self, out, indent = 0): + self.strokes.toSVG(out, self.kId(), [0], [1]) def simplify(self): - self.root.simplify() + self.strokes.simplify() def getStrokes(self): - return self.root.getStrokes() + return self.strokes.getStrokes() class StrokeGr: @@ -200,6 +207,11 @@ class Stroke: def __init__(self, parent): self.stype = None self.svg = None + self.numberPos = None + + def numberToSVG(self, out, number, indent = 0): + if self.numberPos: + out.write("\t" * indent + '%d\n' % (self.numberPos[0], self.numberPos[1], number)) def toSVG(self, out, rootId, groupCpt, strCpt, indent = 0): pid = rootId + "-s" + str(strCpt[0]) @@ -207,85 +219,6 @@ class Stroke: if not self.svg: out.write("\t" * indent + '\n' % (pid, self.stype)) else: out.write("\t" * indent + '\n' % (pid, self.stype, self.svg)) - -class StructuredKanji: - """A more structured format for the kanji, where all the parts of groups are grouped together.""" - def __init__(self, kanji): - self.components = [] - self.strokes = [] - - stk = [] - self.__buildStructure(kanji.root, stk, None) - - def __mostCommonAncestor(self, np, npp): - # Update the parent to the most common parent of all parts - npSave = np - if np != None: - while np != npp: - np = np.parent - if np == None: - npp = npp.parent - np = npSave - return np - - def __buildStructure(self, group, stk, parent): - # Find the component if it exists already, or create it as needed - # Number exists and part is > 1, we must find a component which number matches. - newParent = None - if group.number > 0 and group.part > 1: - for component in self.components: - if component.element == group.element and component.number == group.number: - newParent = component - component.parent = self.__mostCommonAncestor(component.parent, parent) - break - # Should never happen - if not newParent: raise Exception("Unable to find component!") - # No number but a part, we need the latest component which element matches - elif group.part > 1: - for component in self.components: - if component.element == group.element: - newParent = component - component.parent = self.__mostCommonAncestor(component.parent, parent) - break - if not newParent: raise Exception("Unable to find component!") - # Either a single part component or a first part - we need to create the component - else: - # Only do that if the current group has an element - if group.element: - newParent = StructuredStrokeGroup(parent, group.element, group.original, group.number) - self.components.append(newParent) - # Else keep the same parent - else: newParent = parent - - if newParent != parent: stk.append(newParent) - - # Add the found group as a child of its parent - if parent: parent.childs.append(newParent) - - # Now parse the childs of the group - for child in group.childs: - # Another group - we need to call ourselves recursively to build it - if isinstance(child, StrokeGr): - self.__buildStructure(child, stk, newParent) - # A stroke - just add it to our list as well as - # to the list of all the parents on the stack - elif isinstance(child, Stroke): - self.strokes.append(child) - for pGroup in stk: pGroup.strokes.append(child) - # Set the direct parent of the child - child.parent = newParent - - if newParent != parent: stk.pop() - -class StructuredStrokeGroup: - def __init__(self, parent, element, original, number): - self.parent = parent - self.element = element - self.original = original - self.number = number - self.childs = [] - self.strokes = [] - class KanjisHandler(BasicHandler): """XML handler for parsing kanji files. It can handle single-kanji files or aggregation files. After parsing, the kanjis are accessible through the kanjis member, indexed by their svg file name.""" def __init__(self, code, variant): @@ -353,9 +286,9 @@ class KanjisHandler(BasicHandler): def handle_end_strokegr(self): group = self.groups.pop() if len(self.groups) == 0: - if self.kanji.root: + if self.kanji.strokes: print "WARNING: overwriting root of kanji!" - self.kanji.root = group + self.kanji.strokes = group def handle_start_stroke(self, attrs): if len(self.groups) == 0: parent = None @@ -366,7 +299,7 @@ class KanjisHandler(BasicHandler): self.groups[-1].childs.append(stroke) class SVGHandler(BasicHandler): - """SVG handler for parsing final kanji files. It can handle single-kanji files or aggregation files. After parsing, the kanjis are accessible through the kanjis member, indexed by their svg file name.""" + """SVG handler for parsing final kanji files. It can handle single-kanji files or aggregation files. After parsing, the kanji are accessible through the kanjis member, indexed by their svg file name.""" def __init__(self): BasicHandler.__init__(self) self.kanjis = {} @@ -431,7 +364,7 @@ class SVGHandler(BasicHandler): group = self.groups.pop() # End of kanji? if len(self.groups) == 0: - self.currentKanji.root = group + self.currentKanji.strokes = group self.currentKanji = None self.groups = [] diff --git a/mergexml.py b/mergexml.py index 5e8c8986f..6a5f5c78b 100755 --- a/mergexml.py +++ b/mergexml.py @@ -43,7 +43,7 @@ kanjivg:phon CDATA #IMPLIED > xmlns:kanjivg CDATA #FIXED "http://kanjivg.tagaini.net" kanjivg:type CDATA #IMPLIED > ]> - + """) # # # # # - kanji.toSVG(out) + out.write("""\n""") + kanji.outputStrokes(out) + out.write("\n"); + out.write("""\n""") + kanji.outputStrokesNumbers(out) + out.write("\n") out.write("\n") +# Basic handler to extract the information we need from former SVG files class KanjiStrokeHandler(BasicHandler): def __init__(self): BasicHandler.__init__(self) self.strokes = [] - self.active = False + self.strokesNumbers = [] + # 0 -> do nothing, 1 -> parse numbers, 2 -> parse strokes + self.step = 0 + + # Extract position of number + def handle_start_text(self, attrs): + if self.step != 1: return + if not attrs.has_key("transform"): return + transformData = attrs["transform"] + match = re.match("matrix\(.+ .+ .+ .+ (.+) (.+)\)", transformData) + if not match: return + self.strokesNumbers.append((float(match.group(1)), float(match.group(2)))) def handle_start_path(self, attrs): + if self.step != 2: return strokeData = attrs["d"] # Replace spaces between digits by the comma separator strokeData = re.sub('(\d) (\d)', '\\1,\\2', strokeData) strokeData = re.sub("[\n\t ]+", "", strokeData) - self.strokes.append(strokeData) def handle_start_g(self, attrs): - if attrs.has_key("id") and attrs["id"] == "Vektorbild": self.active = True + if attrs.has_key("id"): + if attrs["id"] == "StrokeNumbers": self.step = 1 + elif attrs["id"] == "StrokePaths": self.step = 2 if __name__ == "__main__": - os.mkdir("output") - os.mkdir("output/SVG") - os.mkdir("output/SVGMismatch") + os.mkdir("kanjivg") + os.mkdir("kanjivgMismatch") files = os.listdir("XML") handled = set() metComponents = set() @@ -108,23 +126,27 @@ if __name__ == "__main__": kanji = descHandler.kanji desc = kanji.getStrokes() svg = svgHandler.strokes + numbers = svgHandler.strokesNumbers + if len(svg) != len(numbers): + print "Warning: kanji %s has %d strokes but %d numbers!" % (kId, len(svg), len(numbers)) + # Copy SVG into kanji desc for i in range(min(len(desc), len(svg))): desc[i].svg = svg[i] + if i < len(numbers): desc[i].numberPos = numbers[i] # Add dummy strokes for SVG orphans for i in range(len(desc), len(svg)): - s = Stroke(kanji.root) + s = Stroke(kanji.strokes) s.stype = "Missing stroke" s.svg = svg[i] - kanji.root.childs.append(s) + kanji.strokes.childs.append(s) - if len(desc) != len(svg): dst = "SVGMismatch" - else: dst = "SVG" - out = codecs.open("output/%s/%s.svg" % (dst, kanji.kId()), "w", "utf-8") + if len(desc) != len(svg): dst = "kanjivgMismatch" + else: dst = "kanjivg" + out = codecs.open("%s/%s.svg" % (dst, kanji.kId()), "w", "utf-8") createSVG(out, kanji) - # Now parse orphan SVGs (probably just kana and romaji) files = os.listdir("SVG") for f in files: @@ -143,9 +165,10 @@ if __name__ == "__main__": parser.parse(os.path.join("SVG", f)) kanji = Kanji(int(code, 16), variant) - kanji.root = StrokeGr(None) + kanji.strokes = StrokeGr(None) for s in svgHandler.strokes: - stroke = Stroke(kanji.root) + stroke = Stroke(kanji.strokes) stroke.svg = s - kanji.root.childs.append(stroke) - # TODO merge with upper part - kana and romaji should not be considered mismatched + kanji.strokes.childs.append(stroke) + out = codecs.open("kanjivg/%s.svg" % (kanji.kId(),), "w", "utf-8") + createSVG(out, kanji) diff --git a/releasefile.py b/releasefile.py old mode 100644 new mode 100755 index d402040a0..dc58b6710 --- a/releasefile.py +++ b/releasefile.py @@ -22,7 +22,7 @@ import os, datetime, re from kanjivg import licenseString -__datadir = "output/SVG" +__datadir = "kanjivg" if __name__ == "__main__": allfiles = os.listdir(__datadir)