diff options
Diffstat (limited to 'www/wiki/maintenance/language/zhtable/Makefile.py')
-rwxr-xr-x | www/wiki/maintenance/language/zhtable/Makefile.py | 452 |
1 files changed, 452 insertions, 0 deletions
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author Philip
"""Parsers and helpers used to build the zh conversion tables.

The functions below download and extract the upstream data files (Unihan,
scim-tables, scim-pinyin, libtabe) and parse them, together with the local
``*.manual`` rule files, into character and word tables.
"""
import os
import platform
import re
import shutil
import subprocess
import sys
import tarfile
import zipfile

pyversion = platform.python_version()
islinux = platform.system().lower() == 'linux'

if pyversion[:3] in ['2.6', '2.7']:
    import urllib as urllib_request
    import codecs
    # codecs.open accepts ``encoding``/``errors`` like the Python 3 builtin.
    open = codecs.open
    _unichr = unichr
    if sys.maxunicode < 0x10000:
        # Narrow build: emit a surrogate pair for code points above the BMP.
        def unichr(i):
            if i < 0x10000:
                return _unichr(i)
            else:
                return _unichr(0xD7C0 + (i >> 10)) + _unichr(0xDC00 + (i & 0x3FF))
elif pyversion[:2] == '3.':
    import urllib.request as urllib_request
    unichr = chr


def unichr2(*args):
    """Convert ``U+XXXX`` tokens (optionally followed by ``<source``) to chars."""
    return [unichr(int(i.split('<')[0][2:], 16)) for i in args]


def unichr3(*args):
    """Convert ``U+XXXX`` fields to chars, ignoring empty fields.

    Each field is stripped first so that surrounding whitespace neither
    shifts the hex digits out of the ``[2:7]`` window nor turns a
    whitespace-only field into an ``int()`` error.
    """
    chars = []
    for field in args:
        hexdigits = field.strip()[2:7]
        if hexdigits:
            chars.append(unichr(int(hexdigits, 16)))
    return chars

# DEFINE
UNIHAN_VER = '6.3.0'
SF_MIRROR = 'dfn'
SCIM_TABLES_VER = '0.5.13'
SCIM_PINYIN_VER = '0.5.92'
LIBTABE_VER = '0.2.3'
# END OF DEFINE


def download(url, dest):
    """Download *url* to *dest* unless *dest* already exists.

    Raises IOError when wget exits with a non-zero status.
    """
    if os.path.isfile(dest):
        print('File %s is up to date.' % dest)
        return
    if islinux:
        # we use wget instead urlretrieve under Linux,
        # because wget could display details like download progress
        try:
            status = subprocess.call(['wget', url, '-O', dest])
        except OSError:
            # wget could not be started; fall through to urlretrieve.
            status = None
        if status == 0:
            return
        if status is not None:
            # wget may leave an empty or partial file behind; remove it so
            # the next run does not report it as up to date.
            if os.path.isfile(dest):
                os.remove(dest)
            raise IOError('wget exited with status %d for %s' % (status, url))
    print('Downloading from [%s] ...' % url)
    urllib_request.urlretrieve(url, dest)
    print('Download complete.\n')
    return


def uncompress(fp, member, encoding='U8'):
    """Extract *member* from archive *fp* into the current directory.

    The file is moved to its base name, the extracted directory tree (if
    any) is removed, and the file is returned opened for text reading with
    undecodable bytes ignored. The caller owns both *fp* and the result.
    """
    name = member.rsplit('/', 1)[-1]
    print('Extracting %s ...' % name)
    fp.extract(member)
    shutil.move(member, name)
    if '/' in member:
        shutil.rmtree(member.split('/', 1)[0])
    # ``open`` is codecs.open under Python 2, so keywords work on both.
    return open(name, 'r', encoding=encoding, errors='ignore')


def unzip(path, member, encoding='U8'):
    """Extract *member* from the zip file *path*; see uncompress()."""
    archive = zipfile.ZipFile(path)
    try:
        return uncompress(archive, member, encoding)
    finally:
        archive.close()


def untargz(path, member, encoding='U8'):
    """Extract *member* from the gzipped tarball *path*; see uncompress()."""
    archive = tarfile.open(path, 'r:gz')
    try:
        return uncompress(archive, member, encoding)
    finally:
        archive.close()


def parserCore(fp, pos, beginmark=None, endmark=None):
    """Collect multi-character words from column *pos* of *fp*.

    When both marks are given, only lines after the one starting with
    *beginmark* are read; a line starting with *endmark* always stops the
    scan. Lines starting with ``#`` and lines with fewer than two columns
    are skipped. A word is kept only if both the first column and column
    *pos* are longer than one character.
    """
    start = not (beginmark and endmark)
    mlist = set()
    for line in fp:
        if beginmark and line.startswith(beginmark):
            start = True
            continue
        elif endmark and line.startswith(endmark):
            break
        if start and not line.startswith('#'):
            elems = line.split()
            if len(elems) < 2:
                continue
            elif len(elems[0]) > 1 and len(elems[pos]) > 1:  # words only
                mlist.add(elems[pos])
    return mlist


def tablesParser(path, name):
    """ Read file from scim-tables and parse it. """
    src = 'scim-tables-%s/tables/zh/%s' % (SCIM_TABLES_VER, name)
    fp = untargz(path, src, 'U8')
    try:
        return parserCore(fp, 1, 'BEGIN_TABLE', 'END_TABLE')
    finally:
        fp.close()


def ezbigParser(path):
    """ Parse EZ-Big.txt.in from the scim-tables tarball. """
    return tablesParser(path, 'EZ-Big.txt.in')


def wubiParser(path):
    """ Parse Wubi.txt.in from the scim-tables tarball. """
    return tablesParser(path, 'Wubi.txt.in')


def zrmParser(path):
    """ Parse Ziranma.txt.in from the scim-tables tarball. """
    return tablesParser(path, 'Ziranma.txt.in')


def phraseParser(path):
    """ Read phrase_lib.txt and parse it. """
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    fp = untargz(path, src, 'U8')
    try:
        return parserCore(fp, 0)
    finally:
        fp.close()


def tsiParser(path):
    """ Read tsi.src and parse it. """
    src = 'libtabe/tsi-src/tsi.src'
    fp = untargz(path, src, 'big5hkscs')
    try:
        return parserCore(fp, 0)
    finally:
        fp.close()


def unihanParser(path):
    """ Read Unihan_Variants.txt and parse it.

    Returns ``(t2s, s2t)``: dicts mapping a character to the list of its
    kSimplifiedVariant / kTraditionalVariant characters respectively.
    """
    fp = unzip(path, 'Unihan_Variants.txt', 'U8')
    t2s = dict()
    s2t = dict()
    try:
        for line in fp:
            if line.startswith('#'):
                continue
            elems = line.split()
            if len(elems) < 3:
                continue
            variant_type = elems.pop(1)
            elems = unichr2(*elems)
            if variant_type == 'kTraditionalVariant':
                s2t[elems[0]] = elems[1:]
            elif variant_type == 'kSimplifiedVariant':
                t2s[elems[0]] = elems[1:]
    finally:
        fp.close()
    return (t2s, s2t)


def applyExcludes(mlist, path):
    """ Apply exclude rules from path to mlist.

    Every whitespace-separated token before a ``#`` on a line is a regex
    fragment; words in the set *mlist* containing a match are removed in
    place. *mlist* is returned. A file without rules removes nothing.
    """
    excludes = []
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            # Drop the comment first so its words are not taken as rules.
            excludes.extend(line.split('#')[0].split())
    if not excludes:
        # An empty alternation would match (and so remove) every word.
        return mlist
    excptn = re.compile('(?:%s)' % '|'.join(excludes))
    # Materialise the matches before mutating the set being scanned.
    diff = [mword for mword in mlist if excptn.search(mword)]
    mlist.difference_update(diff)
    return mlist


def charManualTable(path):
    """Yield ``(char, [targets...])`` from a ``U+XXXX|U+YYYY|...`` file.

    Text after ``#`` is ignored; lines with fewer than two code points
    yield nothing.
    """
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            elems = line.split('#')[0].split('|')
            elems = unichr3(*elems)
            if len(elems) > 1:
                yield elems[0], elems[1:]


def toManyRules(src_table):
    """Return the set of all non-first targets in a one-to-many table."""
    tomany = set()
    for targets in src_table.values():
        tomany.update(targets[1:])
    return tomany


def removeRules(path, table):
    """Remove the rules listed in *path* from *table* (in place).

    Each line is ``from`` or ``from => to``, optionally quoted. The ``from``
    key is dropped, and so is every entry whose value equals ``to`` (which
    defaults to ``from``). Values are compared literally. Returns *table*.
    """
    texc = set()
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            elems = line.split('=>')
            f = t = elems[0].strip()
            if len(elems) == 2:
                t = elems[1].strip()
            f = f.strip('"').strip("'")
            t = t.strip('"').strip("'")
            if f:
                table.pop(f, None)
            if t:
                texc.add(t)
    # Iterate over a snapshot because entries are removed while scanning.
    for (tmp_f, tmp_t) in list(table.items()):
        if tmp_t in texc:
            table.pop(tmp_f)
    return table


def customRules(path):
    """Read a tab-separated rule file into a ``{from: to}`` dict.

    Text after ``#`` is ignored. Only the first two columns are used, and
    lines with fewer than two columns are skipped.
    """
    ret = dict()
    with open(path, 'r', encoding='U8') as fp:
        for line in fp:
            line = line.rstrip('\r\n')
            if '#' in line:
                line = line.split('#')[0].rstrip()
            elems = line.split('\t')
            if len(elems) > 1:
                ret[elems[0]] = elems[1]
    return ret


def dictToSortedList(src_table, pos):
    """Return the table's items sorted by element *pos*, then the other."""
    return sorted(src_table.items(), key=lambda m: (m[pos], m[1 - pos]))


def translate(text, conv_table):
    """Convert *text* with *conv_table* using greedy longest matching.

    At each position the longest key found in the table is replaced by its
    (non-empty) value and scanning resumes after the replacement, so
    replaced text is never converted again.
    """
    i = 0
    while i < len(text):
        for j in range(len(text) - i, 0, -1):
            f = text[i:i + j]
            t = conv_table.get(f)
            if t:
                text = text[:i] + t + text[i + j:]
                # Land on the last replaced char; the += 1 below skips it.
                i += len(t) - 1
                break
        i += 1
    return text


def manualWordsTable(path, conv_table, reconv_table):
    """Build a word table from the manual phrase list in *path*.

    Words are processed shortest first. Each word's conversion through
    *conv_table* is mapped back to the word, and a word that a working copy
    of *reconv_table* would alter is additionally mapped to itself. Blank
    and comment-only lines are ignored.
    """
    reconv_table = reconv_table.copy()
    out_table = {}
    with open(path, 'r', encoding='U8') as fp:
        words = set(line.split('#')[0].strip() for line in fp)
    # Empty strings come from blank/comment lines and carry no rule.
    words.discard('')
    for word in sorted(words, key=lambda w: (len(w), w)):
        new_word = translate(word, conv_table)
        rcv_word = translate(word, reconv_table)
        if word != rcv_word:
            reconv_table[word] = out_table[word] = word
        reconv_table[new_word] = out_table[new_word] = word
    return out_table


def defaultWordsTable(src_wordlist, src_tomany, char_conv_table,
                      char_reconv_table):
    """Derive the word rules needed on top of the character tables.

    Words are handled in groups of equal length, shortest first. The rules
    found in one group are merged into the working tables before the next
    group starts, so longer words are only recorded when the shorter rules
    do not already cover them. Returns ``{converted_word: word}``.
    """
    wordlist = list(src_wordlist)
    wordlist.sort(key=lambda w: (len(w), w), reverse=True)
    word_conv_table = {}
    word_reconv_table = {}
    conv_table = char_conv_table.copy()
    reconv_table = char_reconv_table.copy()
    # Escape the characters; with no characters there is nothing to match
    # (an empty alternation would otherwise match every word).
    tomany = [re.escape(char) for char in src_tomany]
    tomanyptn = re.compile('(?:%s)' % '|'.join(tomany)) if tomany else None
    while wordlist:
        conv_table.update(word_conv_table)
        reconv_table.update(word_reconv_table)
        word_len = len(wordlist[-1])
        # Peek before popping so the first word of the next length group
        # is left for the next pass instead of being dropped.
        while wordlist and len(wordlist[-1]) == word_len:
            word = wordlist.pop()
            test_word = translate(word, reconv_table)
            new_word = translate(word, conv_table)
            if reconv_table.get(new_word):
                continue
            if test_word != word or \
               (tomanyptn is not None and tomanyptn.search(word) and
                    word != translate(new_word, reconv_table)):
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
    return word_reconv_table


def _phpQuote(text):
    """Escape *text* for use inside a PHP single-quoted string."""
    return text.replace('\\', '\\\\').replace('\'', '\\\'')


def PHPArray(table):
    """Render ``(from, to)`` pairs as PHP ``'from' => 'to',`` lines.

    Pairs with an empty side are skipped; backslashes and single quotes
    are escaped so the output stays valid PHP.
    """
    lines = ['\'%s\' => \'%s\',' % (_phpQuote(f), _phpQuote(t))
             for (f, t) in table if f and t]
    return '\n'.join(lines)


def main():
    """Download the source data, build all tables, write ZhConversion.php."""
    # Get Unihan.zip:
    url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
    han_dest = 'Unihan-%s.zip' % UNIHAN_VER
    download(url, han_dest)

    sfurlbase = 'http://%s.dl.sourceforge.net/sourceforge/' % SF_MIRROR

    # Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
    url = sfurlbase + 'scim/scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    download(url, tbe_dest)

    # Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
    url = sfurlbase + 'scim/scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    download(url, pyn_dest)

    # Get libtabe-$(LIBTABE_VER).tgz:
    url = sfurlbase + 'libtabe/libtabe-%s.tgz' % LIBTABE_VER
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    download(url, lbt_dest)

    # Unihan.txt
    (t2s_1tomany, s2t_1tomany) = unihanParser(han_dest)

    t2s_1tomany.update(charManualTable('symme_supp.manual'))
    t2s_1tomany.update(charManualTable('trad2simp.manual'))
    s2t_1tomany.update((t[0], [f]) for (f, t) in charManualTable('symme_supp.manual'))
    s2t_1tomany.update(charManualTable('simp2trad.manual'))

    # The first target of every one-to-many rule is the default conversion.
    t2s_1to1 = dict((f, t[0]) for (f, t) in t2s_1tomany.items())
    s2t_1to1 = dict((f, t[0]) for (f, t) in s2t_1tomany.items())

    s_tomany = toManyRules(t2s_1tomany)
    t_tomany = toManyRules(s2t_1tomany)

    # noconvert rules
    t2s_1to1 = removeRules('trad2simp_noconvert.manual', t2s_1to1)
    s2t_1to1 = removeRules('simp2trad_noconvert.manual', s2t_1to1)

    # the superset for word to word conversion
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()
    t2s_1to1_supp.update(customRules('trad2simp_supp_set.manual'))
    s2t_1to1_supp.update(customRules('simp2trad_supp_set.manual'))

    # word to word manual rules
    t2s_word2word_manual = manualWordsTable('simpphrases.manual',
                                            s2t_1to1_supp, t2s_1to1_supp)
    t2s_word2word_manual.update(customRules('toSimp.manual'))
    s2t_word2word_manual = manualWordsTable('tradphrases.manual',
                                            t2s_1to1_supp, s2t_1to1_supp)
    s2t_word2word_manual.update(customRules('toTrad.manual'))

    # word to word rules from input methods
    t_wordlist = set()
    s_wordlist = set()
    t_wordlist.update(ezbigParser(tbe_dest),
                      tsiParser(lbt_dest))
    s_wordlist.update(wubiParser(tbe_dest),
                      zrmParser(tbe_dest),
                      phraseParser(pyn_dest))

    # exclude
    s_wordlist = applyExcludes(s_wordlist, 'simpphrases_exclude.manual')
    t_wordlist = applyExcludes(t_wordlist, 'tradphrases_exclude.manual')

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update(s2t_word2word_manual)
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update(t2s_word2word_manual)

    # parse list to dict
    t2s_word2word = defaultWordsTable(s_wordlist, s_tomany,
                                      s2t_1to1_supp, t2s_supp)
    t2s_word2word.update(t2s_word2word_manual)
    s2t_word2word = defaultWordsTable(t_wordlist, t_tomany,
                                      t2s_1to1_supp, s2t_supp)
    s2t_word2word.update(s2t_word2word_manual)

    # Final tables
    # sorted list toHans (identity mappings are dropped)
    t2s_1to1 = dict((f, t) for (f, t) in t2s_1to1.items() if f != t)
    toHans = dictToSortedList(t2s_1to1, 0) + dictToSortedList(t2s_word2word, 1)
    # sorted list toHant (identity mappings are dropped)
    s2t_1to1 = dict((f, t) for (f, t) in s2t_1to1.items() if f != t)
    toHant = dictToSortedList(s2t_1to1, 0) + dictToSortedList(s2t_word2word, 1)
    # sorted list toCN
    toCN = dictToSortedList(customRules('toCN.manual'), 1)
    # sorted list toHK
    toHK = dictToSortedList(customRules('toHK.manual'), 1)
    # sorted list toTW
    toTW = dictToSortedList(customRules('toTW.manual'), 1)

    # Get PHP Array
    # Backslashes in the namespace are doubled so Python does not treat
    # "\L" and "\D" as (invalid) escape sequences.
    php = '''<?php
/**
 * Simplified / Traditional Chinese conversion tables
 *
 * Automatically generated using code and data in maintenance/language/zhtable/
 * Do not modify directly!
 *
 * @file
 */

namespace MediaWiki\\Languages\\Data;

class ZhConversion {
public static $zh2Hant = [\n'''
    php += PHPArray(toHant) \
        + '\n];\n\npublic static $zh2Hans = [\n' \
        + PHPArray(toHans) \
        + '\n];\n\npublic static $zh2TW = [\n' \
        + PHPArray(toTW) \
        + '\n];\n\npublic static $zh2HK = [\n' \
        + PHPArray(toHK) \
        + '\n];\n\npublic static $zh2CN = [\n' \
        + PHPArray(toCN) \
        + '\n];\n}\n'

    out_path = os.path.join('..', '..', '..', 'languages', 'data',
                            'ZhConversion.php')
    print('Writing ZhConversion.php ... ')
    # ``open`` is codecs.open under Python 2, so the keyword works on both.
    with open(out_path, 'w', encoding='utf8') as f:
        f.write(php)

    # Remove temporary files
    print('Deleting temporary files ... ')
    for name in ('EZ-Big.txt.in', 'phrase_lib.txt', 'tsi.src',
                 'Unihan_Variants.txt', 'Wubi.txt.in', 'Ziranma.txt.in'):
        os.remove(name)


if __name__ == '__main__':
    main()