#!/usr/bin/env python
## mkkanatables.py
## Copyright (C) 2004 Gary Benson
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

# Script to generate tables for Kana Chameleon.
#
# The code runs unchanged on Python 2.6+ and Python 3: it avoids the
# print statement, xrange, apply, dict.has_key and the builtin reduce.

import operator
import sys
from functools import reduce

try:
    unichr
except NameError:  # Python 3: chr() already returns a unicode character
    unichr = chr

# First code point of each Unicode block; kana live at block + offset.
HIRAGANA = 0x3040
KATAKANA = 0x30A0

# Largest value representable as a non-negative signed 32-bit integer.
MAX_INT32 = 0x7FFFFFFF


class KanaTable(object):
    """Mapping from romaji strings to tuples of Unicode code points.

    Lower-case keys produce hiragana and upper-case keys produce
    katakana.  The table is fully populated by the constructor and is
    stored in the ``kana`` dictionary.
    """

    def __init__(self):
        self.kana = {}
        # Order matters: later steps combine entries added by earlier
        # ones (and the katakana combos deliberately overwrite a few
        # upper-case basic entries such as "WO", "DI" and "DU").
        self.addBasicKana()
        self.addPunctuation()
        self.addNumbers()
        self.addCommonCombos()
        self.addKatakanaCombos()
        self.addSmallTsus()
        self.addLongVowels()

    def addBasicKana(self):
        """Add all single-character kana.

        The romaji names are generated in the same order as the code
        points appear in the Unicode blocks, so a name's position in
        the sequence is its offset from the start of the block.
        """
        voiced = {"k": "g", "s": "z", "t": "d", "h": "bp"}
        hepburn = {"si": "shi", "zi": "ji", "ti": "chi",
                   "tu": "tsu", "hu": "fu"}

        order = []
        for v in "aiueo":
            order.append("x" + v)   # small vowel precedes the full one
            order.append(v)
        for c1 in "kstnhmyrw":
            for v in "aiueo":
                for c2 in c1 + voiced.get(c1, ""):
                    kun = c2 + v
                    hep = hepburn.get(kun, kun)
                    if hep in ("tsu", "ya", "yu", "yo", "wa"):
                        order.append("x" + hep)
                    # yi, ye and wu have no code point at all.
                    if hep not in ("yi", "ye", "wu"):
                        order.append(hep)
        order.append("n")
        order.append("vu")

        for offset, romaji in enumerate(order, 1):
            # wi and we occupy a code point but are not entered here;
            # addKatakanaCombos defines them as katakana digraphs.
            if romaji in ("wi", "we"):
                continue
            # vu is entered for katakana only.
            if romaji != "vu":
                self.kana[romaji] = (HIRAGANA + offset,)
            self.kana[romaji.upper()] = (KATAKANA + offset,)

    def addPunctuation(self):
        """Add punctuation."""
        for romaji, code in (
                (" ", 0x3000), ("~", 0xFF5E), ("-", 0x30FC), ("_", 0xFF3F),
                ("<", 0x3008), (">", 0x3009), ("<<", 0x300A), (">>", 0x300B),
                ("[", 0x300C), ("]", 0x300D), ("{", 0x300E), ("}", 0x300F),
                ("(", 0xFF08), (")", 0xFF09), ("[[", 0x3010), ("]]", 0x3011),
                (".", 0x3002), (",", 0x3001), ("!", 0xFF01), ("?", 0xFF1F),
                ("*", 0x30FB), ("^", 0xFF3E)):
            self.kana[romaji] = (code,)

    def addNumbers(self):
        """Add doublewidth numbers."""
        for n in range(10):
            self.kana[str(n)] = (0xFF10 + n,)

    def addCommonCombos(self):
        """Add sounds which are composed from other kana."""
        # Aliases for n
        for romaji in ("n'", "n-", "m"):
            self._addMulti(romaji, ("n",))

        # Suffixed with small ya, yu or yo
        special = {"s": "sh", "t": "ch", "z": "j"}
        for c in "kgsztnhbpmr":
            base = special.get(c, c) + "i"
            stem = special.get(c, c + "y")
            for v in "auo":
                self._addMulti(stem + v, (base, "xy" + v))

    def addKatakanaCombos(self):
        """Add katakana-only sounds which are composed from other kana."""
        bases = {
            "y": "i", "w": "u", "kw": "ku", "gw": "gu", "sh": "shi",
            "j": "ji", "ch": "chi", "t": "te", "d": "de", "ts": "tsu",
            "f": "fu", "v": "vu"}
        for romaji in (
                "ye", "wi", "we", "wo", "kwa", "kwi", "kwe", "kwo",
                "gwa", "gwi", "gwe", "gwo", "she", "je", "che", "ti", "tu",
                "tyu", "di", "du", "dyu", "tsa", "tsi", "tse", "tso",
                "fa", "fi", "fe", "fo", "fyu", "va", "vi", "ve", "vo",
                "vyu"):
            # The trailing small kana is "yu" (two letters) or a vowel.
            tail = 2 if romaji.endswith("yu") else 1
            self._addMulti(
                romaji,
                (bases[romaji[:-tail]], "x" + romaji[-tail:]),
                only=KATAKANA)

    def addSmallTsus(self):
        """Add combinations prefixed with small tsu."""
        # Iterate over a snapshot: _addMulti inserts new keys, which
        # must neither be revisited nor disturb the iteration.
        for romaji in list(self.kana.keys()):
            first = romaji[0]
            if first in "kptcs":
                # A doubled "ch" is spelled "tch", not "cch".
                doubled = "t" if first == "c" else first
                self._addMulti(doubled + romaji, ("xtsu", romaji))

    def addLongVowels(self):
        """Add katakana with long vowels."""
        # Snapshot for the same reason as in addSmallTsus.
        for romaji in list(self.kana.keys()):
            if romaji[0] != "x" and romaji[-1] in "aiueo":
                self._addMulti(
                    romaji + romaji[-1], (romaji, "-"), only=KATAKANA)

    def _addMulti(self, romaji, bits, only=None):
        """Define romaji as the concatenation of existing entries.

        bits names the entries to join.  With only=KATAKANA just the
        upper-case (katakana) entry is created, with only=HIRAGANA just
        the lower-case one, and by default both are.
        """
        if only != KATAKANA:
            self.kana[romaji] = reduce(
                operator.add, [self.kana[bit] for bit in bits])
        if only != HIRAGANA:
            self.kana[romaji.upper()] = reduce(
                operator.add, [self.kana[bit.upper()] for bit in bits])

    def convert(self, input):
        """Convert some romaji to kana.

        At each position the longest matching key wins; a character
        that starts no key is copied through unchanged.
        """
        longest = max([len(romaji) for romaji in self.kana])
        tries = range(longest, 0, -1)
        pieces = []
        while input:
            for length in tries:
                if len(input) < length:
                    continue
                kana = self.kana.get(input[:length])
                if kana is not None:
                    pieces.append("".join(map(unichr, kana)))
                    input = input[length:]
                    break
            else:
                pieces.append(input[0])
                input = input[1:]
        return "".join(pieces)

    def analyse(self):
        """Print the code points in 3001-30FC that the table never uses.

        Runs of three or more consecutive code points are printed as
        "first-last"; shorter runs are listed individually.
        """
        used = set()
        for block in self.kana.values():
            used.update(block)
        missing = [i for i in range(0x3001, 0x30FD) if i not in used]

        ranges = []
        for char in missing:
            if ranges and char == ranges[-1][-1] + 1:
                ranges[-1].append(char)
            else:
                ranges.append([char])

        parts = []
        for r in ranges:
            if len(r) == 1:
                parts.append("%x" % r[0])
            elif len(r) == 2:
                parts.append("%x, %x" % (r[0], r[-1]))
            else:
                parts.append("%x-%x" % (r[0], r[-1]))
        sys.stdout.write(", ".join(parts) + "\n")

    def dump(self, convert):
        """Return the table as the source of a ``$table = array(...)``.

        convert is called with each tuple of code points and returns
        the text to emit for that value.  Entries are packed greedily,
        longest first, into lines of roughly 70 columns.
        """
        by_length = {}
        for romaji in self.kana:
            entry = "%s => %s" % (repr(romaji), convert(self.kana[romaji]))
            by_length.setdefault(len(entry), []).append(entry)
        # Entry lengths still available, longest first.
        lengths = sorted(by_length, reverse=True)

        lines = []
        while lengths:
            line = []
            while lengths:
                room = 70 - len(", ".join(line) + ",")
                size = next((n for n in lengths if n < room), None)
                if size is None:
                    if line:
                        break
                    # Nothing fits even on an empty line.  Emit the
                    # entry on a line of its own; otherwise no progress
                    # would ever be made and the loop would not end.
                    size = lengths[0]
                bucket = by_length[size]
                line.append(bucket.pop())
                if not bucket:
                    lengths.remove(size)
            lines.append("\t" + ", ".join(line))
        return " $table = array(\n" + ",\n".join(lines) + ");"


def UTF8Encoded(chars):
    """Return chars as a double-quoted string of \\x escaped UTF-8 bytes."""
    text = "".join(map(unichr, chars))
    # bytearray yields integers on both Python 2 and Python 3.
    return '"' + "".join(
        ["\\x%02x" % byte for byte in bytearray(text.encode("utf-8"))]) + '"'


class FunkyEncoded(object):
    """Encode code points as one byte each, XORed with a constant.

    Instances are callable: they take a tuple of code points and
    return a quoted string literal for the resulting byte string.
    """

    def __init__(self, xor):
        self.xor = xor

    def __call__(self, chars):
        return self.funkyrepr("".join(map(self.funkychr, chars)))

    def funkychr(self, char):
        """Return the single-byte character for code point char.

        Raises ValueError if the code point is outside the ranges
        described below, or if the XORed value does not fit a byte.
        """
        # Map unicode 3000-301F and 3040-30FF into 00-1F and 40-FF
        #   (that's part of 3000-303F CJK Symbols and Punctuation,
        #                   3040-309F Hiragana,
        #               and 30A0-30FF Katakana)
        # with unicode FF5E mapped to 20
        #  and FF01-FF1F mapped to 21-3F
        #  and FF3E-FF3F mapped to 1E-1F
        #   (that's bits of FF00-FFEF Halfwidth and Fullwidth Forms)
        if (0x3000 <= char <= 0x301F) or (0x3040 <= char <= 0x30FF):
            char -= 0x3000
        elif char == 0xFF5E:
            char = 0x20
        elif 0xFF01 <= char <= 0xFF1F:
            char -= 0xFEE0
        elif 0xFF3E <= char <= 0xFF3F:
            char -= 0xFF20
        value = char ^ self.xor
        # Python 2's chr() rejected anything outside a byte; keep that
        # guarantee on Python 3, where chr() accepts any code point.
        if not 0 <= value <= 0xFF:
            raise ValueError("code point not representable in one byte")
        return chr(value)

    def funkyrepr(self, input):
        """Return input as a quoted string literal.

        The output is embedded in the ``$table = array(...)`` text
        built by KanaTable.dump, which is PHP syntax.  A single-quoted
        literal is used when every character is printable ASCII;
        otherwise a double-quoted literal with octal escapes is used.
        """
        codes = [ord(c) for c in input]
        if any(code < 32 or code > 126 for code in codes):
            # Double-quoted: backslash, the quote itself and "$"
            # (variable interpolation) need escaping; "'" does not.
            quote = '"'
            special = '\\"$'
        else:
            quote = "'"
            special = "\\'"

        parts = []
        for c, code in zip(input, codes):
            if code < 32 or code > 126:
                # Always three digits, so that a following literal
                # digit cannot be absorbed into the escape.
                parts.append("\\%03o" % code)
            elif c in special:
                parts.append("\\" + c)
            else:
                parts.append(c)
        return quote + "".join(parts) + quote


class WunkyEncoded(FunkyEncoded):
    """Like FunkyEncoded, but pack the bytes into a single integer."""

    def funkyrepr(self, input):
        """Return the bytes of input as a little-endian integer."""
        output = 0
        for c in reversed(input):
            output = (output << 8) | ord(c)
        return output


def main():
    """Pick the best XOR constant and print the resulting table."""
    table = KanaTable()

    # We need to find an XOR which will stop any numbers being
    # negative, thus avoiding 32/64-bit issues, and it has to be an
    # unused character to avoid zeros, which are used to denote the
    # end of the 'string'.
    plain = WunkyEncoded(0)
    used = set()
    for kana in table.kana.values():
        for char in kana:
            used.add(plain((char,)))
    possibles = [c for c in range(1, 256) if c not in used]
    sys.stderr.write("Filtering %d possibles\n" % len(possibles))

    # Python integers never wrap, so "negative" has to be spelled out
    # as "does not fit a non-negative signed 32-bit integer".
    xors = []
    for xor in possibles:
        encode = WunkyEncoded(xor)
        if all(0 <= encode(kana) <= MAX_INT32
               for kana in table.kana.values()):
            xors.append(xor)
    sys.stderr.write("Testing %d valid xors\n" % len(xors))
    if not xors:
        sys.exit("No usable xor found")

    # The best XOR is the one giving the shortest dump.
    scores = {}
    for xor in xors:
        score = len(table.dump(WunkyEncoded(xor)))
        scores.setdefault(score, []).append(xor)
    bestscore = min(scores)
    bestxors = sorted(scores[bestscore])
    sys.stderr.write("Best: %d (xors = %s)\n" % (bestscore, bestxors))
    bestxor = bestxors[0]
    sys.stderr.write("Using xor = %d\n" % bestxor)
    sys.stdout.write(table.dump(WunkyEncoded(bestxor)) + "\n")


if __name__ == "__main__":
    main()