#!/usr/bin/python
# -*- coding: utf-8 -*-
#
#  Copyright (C) 2009  Alexandre Courbot
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.

import os
import codecs
import xml.sax

from kanjivg import *


def realord(s, pos = 0):
    """Return the code point of the character of `s` starting at `pos`.

    If the unit at `pos` is a high surrogate (0xD800-0xDBFF) and the next
    unit is a low surrogate (0xDC00-0xDFFF), both are combined into the
    supplementary-plane code point they encode.

    If a high surrogate is the last unit of `s`, a warning is printed and
    0 is returned. If it is followed by something that is not a low
    surrogate, the value of the high surrogate itself is returned.
    """
    code = ord(s[pos])
    if 0xD800 <= code < 0xDC00:
        if len(s) <= pos + 1:
            print("realord warning: missing surrogate character")
            return 0
        code2 = ord(s[pos + 1])
        # The upper bound must be checked on the second unit: `code` is
        # already known to be below 0xDC00 at this point.
        if 0xDC00 <= code2 < 0xE000:
            code = 0x10000 + ((code - 0xD800) * 0x400) + (code2 - 0xDC00)
    return code


def addComponents(strokegr, compSet):
    """Collect into `compSet` the components used by `strokegr`.

    Adds the group's `element` and `original` attributes when they are
    set (truthy), then recurses into every child that is a StrokeGr.
    `compSet` is modified in place; nothing is returned.
    """
    if strokegr.element:
        compSet.add(strokegr.element)
    if strokegr.original:
        compSet.add(strokegr.original)
    for child in strokegr.childs:
        if isinstance(child, StrokeGr):
            addComponents(child, compSet)


def main():
    """Report the components used in kanjivg.xml that have no kanji entry.

    Parses "kanjivg.xml" from the current directory, prints the number of
    distinct components found, then lists every component whose code
    point (in hexadecimal, without the "0x" prefix) is not a key of the
    parsed kanji table.
    """
    # Read all kanjis
    handler = KanjisHandler()
    xml.sax.parse("kanjivg.xml", handler)
    kanjis = sorted(handler.kanjis.values(), key=lambda kanji: kanji.id)

    componentsList = set()
    for kanji in kanjis:
        addComponents(kanji.root, componentsList)
    print(len(componentsList))

    missingComponents = set()
    for component in componentsList:
        # NOTE(review): assumes handler.kanjis is keyed by the lowercase
        # hex code point without padding -- confirm against KanjisHandler.
        key = hex(realord(component))[2:]
        if key not in handler.kanjis:
            missingComponents.add(component)

    print("Missing components:")
    for component in missingComponents:
        # Single-argument form so the output is the same whether print is
        # a statement (Python 2) or a function (Python 3).
        print("%s %s" % (component, hex(realord(component))))


if __name__ == "__main__":
    main()