summaryrefslogtreecommitdiff
path: root/platform/www/inc/Utf8
diff options
context:
space:
mode:
Diffstat (limited to 'platform/www/inc/Utf8')
-rw-r--r--platform/www/inc/Utf8/Asian.php99
-rw-r--r--platform/www/inc/Utf8/Clean.php204
-rw-r--r--platform/www/inc/Utf8/Conversion.php162
-rw-r--r--platform/www/inc/Utf8/PhpString.php383
-rw-r--r--platform/www/inc/Utf8/Table.php93
-rw-r--r--platform/www/inc/Utf8/Unicode.php277
-rw-r--r--platform/www/inc/Utf8/tables/case.php659
-rw-r--r--platform/www/inc/Utf8/tables/loweraccents.php116
-rw-r--r--platform/www/inc/Utf8/tables/romanization.php1458
-rw-r--r--platform/www/inc/Utf8/tables/specials.php615
-rw-r--r--platform/www/inc/Utf8/tables/upperaccents.php114
11 files changed, 4180 insertions, 0 deletions
diff --git a/platform/www/inc/Utf8/Asian.php b/platform/www/inc/Utf8/Asian.php
new file mode 100644
index 0000000..c7baa30
--- /dev/null
+++ b/platform/www/inc/Utf8/Asian.php
@@ -0,0 +1,99 @@
+<?php
+
+namespace dokuwiki\Utf8;
+
+/**
+ * Helpers for dealing with Asian "words"
+ *
+ * Some Asian languages write without spaces and let a single Unicode character stand for a whole
+ * idea. A crude regular expression is therefore used to decide which parts of a string should be
+ * treated as words of their own.
+ */
+class Asian
+{
+
+    /**
+     * Non-capturing group for use in regular expressions, matching any Asian character that needs
+     * to be treated as a word. The Unicode ranges were taken from
+     * http://en.wikipedia.org/wiki/Unicode_block
+     */
+    const REGEXP =
+        '(?:' .
+
+        '[\x{0E00}-\x{0E7F}]' . // Thai
+
+        '|' .
+
+        '[' .
+        '\x{2E80}-\x{3040}' . // CJK -> Hangul
+        '\x{309D}-\x{30A0}' .
+        '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}' .
+        '\x{F900}-\x{FAFF}' . // CJK Compatibility Ideographs
+        '\x{FE30}-\x{FE4F}' . // CJK Compatibility Forms
+        "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F" . // CJK Extension B
+        "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF" . // CJK Extension C
+        "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F" . // CJK Extension D
+        "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF" . // CJK Compatibility Supplement
+        ']' .
+
+        '|' .
+
+        '[' . // Hiragana/Katakana (can be two characters)
+        '\x{3042}\x{3044}\x{3046}\x{3048}' .
+        '\x{304A}-\x{3062}\x{3064}-\x{3082}' .
+        '\x{3084}\x{3086}\x{3088}-\x{308D}' .
+        '\x{308F}-\x{3094}' .
+        '\x{30A2}\x{30A4}\x{30A6}\x{30A8}' .
+        '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}' .
+        '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}' .
+        '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}' .
+        '][' .
+        '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}' .
+        '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}' .
+        '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}' .
+        '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}' .
+        '\x{31F0}-\x{31FF}' .
+        ']?' .
+        ')';
+
+
+    /**
+     * Tell whether the given term contains at least one Asian word character
+     *
+     * @param string $term
+     * @return bool true only when the pattern matched; a failed match attempt counts as false
+     */
+    public static function isAsianWords($term)
+    {
+        $pattern = '/' . self::REGEXP . '/u';
+
+        return preg_match($pattern, $term) === 1;
+    }
+
+    /**
+     * Put the given separator before and after every Asian word found in the text
+     *
+     * @param string $text Original text containing asian words
+     * @param string $sep the separator to use
+     * @return string Text with separated asian words, or the unchanged text if the regexp failed
+     */
+    public static function separateAsianWords($text, $sep = ' ')
+    {
+        // every asian char is handled as a single word (may fail on older PHP versions)
+        $pattern = '/(' . self::REGEXP . ')/u';
+        $separated = @preg_replace($pattern, $sep . '\1' . $sep, $text);
+
+        // null signals a regexp failure: recover by handing back the input untouched
+        return ($separated === null) ? $text : $separated;
+    }
+
+    /**
+     * Cut the given term into its separate parts
+     *
+     * Every returned part is either a non-asian string or a run of asian word characters;
+     * empty parts are dropped.
+     *
+     * @param string $term
+     * @return string[]
+     */
+    public static function splitAsianWords($term)
+    {
+        $pattern = '/(' . self::REGEXP . '+)/u';
+        $flags = PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY;
+
+        return preg_split($pattern, $term, -1, $flags);
+    }
+}
diff --git a/platform/www/inc/Utf8/Clean.php b/platform/www/inc/Utf8/Clean.php
new file mode 100644
index 0000000..0975ff5
--- /dev/null
+++ b/platform/www/inc/Utf8/Clean.php
@@ -0,0 +1,204 @@
+<?php
+
+namespace dokuwiki\Utf8;
+
+/**
+ * Methods to assess and clean UTF-8 strings
+ */
+class Clean
+{
+    /**
+     * Checks if a string contains 7bit ASCII only
+     *
+     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
+     *
+     * @param string $str
+     * @return bool
+     */
+    public static function isASCII($str)
+    {
+        return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
+    }
+
+    /**
+     * Tries to detect if a string is in Unicode encoding
+     *
+     * Only the byte structure is checked: every lead byte has to be followed by the number of
+     * continuation bytes (10bbbbbb) it announces. Lead bytes announcing up to five continuation
+     * bytes are accepted; overlong forms and code point ranges are not verified.
+     *
+     * @author <bmorel@ssi.fr>
+     * @link http://php.net/manual/en/function.utf8-encode.php
+     *
+     * @param string $str
+     * @return bool
+     */
+    public static function isUtf8($str)
+    {
+        $len = strlen($str);
+        for ($i = 0; $i < $len; $i++) {
+            $b = ord($str[$i]);
+            if ($b < 0x80) {
+                continue; # 0bbbbbbb
+            } elseif (($b & 0xE0) === 0xC0) {
+                $n = 1; # 110bbbbb
+            } elseif (($b & 0xF0) === 0xE0) {
+                $n = 2; # 1110bbbb
+            } elseif (($b & 0xF8) === 0xF0) {
+                $n = 3; # 11110bbb
+            } elseif (($b & 0xFC) === 0xF8) {
+                $n = 4; # 111110bb
+            } elseif (($b & 0xFE) === 0xFC) {
+                $n = 5; # 1111110b
+            } else {
+                return false; # Does not match any model
+            }
+
+            for ($j = 0; $j < $n; $j++) { # n bytes matching 10bbbbbb follow ?
+                if ((++$i === $len) || ((ord($str[$i]) & 0xC0) !== 0x80)) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Strips all high byte chars
+     *
+     * Returns a pure ASCII7 string
+     *
+     * @author Andreas Gohr <andi@splitbrain.org>
+     *
+     * @param string $str
+     * @return string
+     */
+    public static function strip($str)
+    {
+        $ascii = '';
+        $len = strlen($str);
+        for ($i = 0; $i < $len; $i++) {
+            if (ord($str[$i]) < 128) {
+                $ascii .= $str[$i];
+            }
+        }
+        return $ascii;
+    }
+
+    /**
+     * Removes special characters (nonalphanumeric) from a UTF-8 string
+     *
+     * This function adds the controlchars 0x00 to 0x19 to the set of
+     * stripped chars (they are not part of the special chars table)
+     *
+     * @author Andreas Gohr <andi@splitbrain.org>
+     *
+     * @param string $string The UTF8 string to strip of special chars
+     * @param string $repl Replace special with this string
+     * @param string $additional Additional chars to strip (used in regexp char class, inserted unescaped)
+     * @return string
+     */
+    public static function stripspecials($string, $repl = '', $additional = '')
+    {
+        // the quoted table is built once per request and then reused
+        static $specials = null;
+        if ($specials === null) {
+            $specials = preg_quote(Table::specialChars(), '/');
+        }
+
+        return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
+    }
+
+    /**
+     * Replace bad bytes with an alternative character
+     *
+     * ASCII character is recommended for replacement char
+     *
+     * PCRE Pattern to locate bad bytes in a UTF-8 string
+     * Comes from W3 FAQ: Multilingual Forms
+     * Note: modified to include full ASCII range including control chars
+     *
+     * The string is processed in a single regexp pass: every well-formed sequence is copied
+     * through and every byte that matches none of the well-formed alternatives is replaced.
+     *
+     * @author Harry Fuecks <hfuecks@gmail.com>
+     * @see http://www.w3.org/International/questions/qa-forms-utf-8
+     *
+     * @param string $str to search
+     * @param string $replace to replace bad bytes with (defaults to '', i.e. bad bytes are dropped) - use ASCII
+     * @return string
+     */
+    public static function replaceBadBytes($str, $replace = '')
+    {
+        $UTF8_BAD =
+            '([\x00-\x7F]' .                          # ASCII (including control chars)
+            '|[\xC2-\xDF][\x80-\xBF]' .               # non-overlong 2-byte
+            '|\xE0[\xA0-\xBF][\x80-\xBF]' .           # excluding overlongs
+            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    # straight 3-byte
+            '|\xED[\x80-\x9F][\x80-\xBF]' .           # excluding surrogates
+            '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        # planes 1-3
+            '|[\xF1-\xF3][\x80-\xBF]{3}' .            # planes 4-15
+            '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        # plane 16
+            '|(.{1}))';                               # invalid byte
+
+        $cleaned = preg_replace_callback(
+            '/' . $UTF8_BAD . '/S',
+            static function ($matches) use ($replace) {
+                // group 2 only takes part in the match when the "invalid byte" alternative hit
+                $isBadByte = isset($matches[2]) && $matches[2] !== '';
+                return $isBadByte ? $replace : $matches[0];
+            },
+            $str
+        );
+
+        // preg_replace_callback() yields null on a regexp error; keep the string return type
+        return (string)$cleaned;
+    }
+
+
+    /**
+     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
+     *
+     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
+     * letters. Default is to deaccent both cases ($case = 0)
+     *
+     * @author Andreas Gohr <andi@splitbrain.org>
+     *
+     * @param string $string
+     * @param int $case
+     * @return string
+     */
+    public static function deaccent($string, $case = 0)
+    {
+        if ($case <= 0) {
+            $string = strtr($string, Table::lowerAccents());
+        }
+        if ($case >= 0) {
+            $string = strtr($string, Table::upperAccents());
+        }
+        return $string;
+    }
+
+    /**
+     * Romanize a non-latin string
+     *
+     * @author Andreas Gohr <andi@splitbrain.org>
+     *
+     * @param string $string
+     * @return string
+     */
+    public static function romanize($string)
+    {
+        if (self::isASCII($string)) return $string; //nothing to do
+
+        return strtr($string, Table::romanization());
+    }
+
+    /**
+     * adjust a byte index into a utf8 string to a utf8 character boundary
+     *
+     * The index is moved for as long as it points at a continuation byte (10bbbbbb).
+     *
+     * @author chris smith <chris@jalakai.co.uk>
+     *
+     * @param string $str utf8 character string
+     * @param int $i byte index into $str
+     * @param bool $next direction to search for boundary, false = up (current character) true = down (next character)
+     * @return int byte index into $str now pointing to a utf8 character boundary
+     */
+    public static function correctIdx($str, $i, $next = false)
+    {
+
+        if ($i <= 0) return 0;
+
+        $limit = strlen($str);
+        if ($i >= $limit) return $limit;
+
+        if ($next) {
+            while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) $i++;
+        } else {
+            while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) $i--;
+        }
+
+        return $i;
+    }
+
+}
diff --git a/platform/www/inc/Utf8/Conversion.php b/platform/www/inc/Utf8/Conversion.php
new file mode 100644
index 0000000..fad9cd0
--- /dev/null
+++ b/platform/www/inc/Utf8/Conversion.php
@@ -0,0 +1,162 @@
+<?php
+
+namespace dokuwiki\Utf8;
+
+/**
+ * Methods to convert from and to UTF-8 strings
+ */
+class Conversion
+{
+
+    /**
+     * Encodes UTF-8 characters to HTML entities
+     *
+     * Code points below 0x100 are written as decimal entities, everything above as
+     * hexadecimal entities.
+     *
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     * @author <vpribish at shopping dot com>
+     * @link http://php.net/manual/en/function.utf8-decode.php
+     *
+     * @param string $str
+     * @param bool $all Also encode 7-bit ASCII characters (they are passed through unchanged otherwise)
+     * @return string
+     */
+    public static function toHtml($str, $all = false)
+    {
+        $ret = '';
+        foreach (Unicode::fromUtf8($str) as $cp) {
+            if ($cp < 0x80 && !$all) {
+                $ret .= chr($cp);
+            } elseif ($cp < 0x100) {
+                $ret .= "&#$cp;";
+            } else {
+                $ret .= '&#x' . dechex($cp) . ';';
+            }
+        }
+        return $ret;
+    }
+
+    /**
+     * Decodes HTML entities to UTF-8 characters
+     *
+     * Convert any &#..; entity to a codepoint,
+     * The entities flag defaults to only decoding numeric entities.
+     * Pass true and named entities, including &amp; &lt; etc.
+     * are handled as well. Avoids the problem that would occur if you
+     * had to decode "&amp;#38;&#38;amp;#38;"
+     *
+     * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
+     * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp#38;"
+     * what it should be -> "&#38;&amp#38;"
+     *
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     *
+     * @param string $str UTF-8 encoded string
+     * @param boolean $entities decode named entities in addition to numeric ones
+     * @return string UTF-8 encoded string with numeric (and named) entities replaced.
+     */
+    public static function fromHtml($str, $entities = false)
+    {
+        // both patterns deliver the hex marker in group 2 and the digits/name in group 3,
+        // which is the layout decodeNumericEntity() relies on
+        if (!$entities) {
+            return preg_replace_callback(
+                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
+                [__CLASS__, 'decodeNumericEntity'],
+                $str
+            );
+        }
+
+        return preg_replace_callback(
+            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
+            [__CLASS__, 'decodeAnyEntity'],
+            $str
+        );
+    }
+
+    /**
+     * Decodes any HTML entity to its correct UTF-8 char equivalent
+     *
+     * Numeric entities are passed on to decodeNumericEntity(); named entities are looked up
+     * in PHP's HTML entity translation table. Unknown names are returned unchanged.
+     *
+     * @param string[] $ent Match array of a single entity as produced by fromHtml()
+     * @return string
+     */
+    protected static function decodeAnyEntity($ent)
+    {
+        // create the named entity lookup table (entity => UTF-8 character)
+        static $table = null;
+        if ($table === null) {
+            // Request the table in UTF-8 explicitly: its characters are then complete UTF-8
+            // strings that can be used as they are. Taking ord() of such a character would only
+            // look at the lead byte and map e.g. &eacute; to the wrong character.
+            $table = array_flip(
+                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
+            );
+        }
+
+        if ($ent[1] === '#') {
+            return self::decodeNumericEntity($ent);
+        }
+
+        if (array_key_exists($ent[0], $table)) {
+            return $table[$ent[0]];
+        }
+
+        return $ent[0];
+    }
+
+    /**
+     * Decodes numeric HTML entities to their correct UTF-8 characters
+     *
+     * @param string[] $ent Match array of a numeric entity: [2] holds the optional x/X hex marker,
+     *                      [3] the digits
+     * @return string|false
+     */
+    protected static function decodeNumericEntity($ent)
+    {
+        switch ($ent[2]) {
+            case 'X':
+            case 'x':
+                $cp = hexdec($ent[3]);
+                break;
+            default:
+                $cp = intval($ent[3]);
+                break;
+        }
+        return Unicode::toUtf8(array($cp));
+    }
+
+    /**
+     * UTF-8 to UTF-16BE conversion.
+     *
+     * Without mb_string every code point is packed into a single 16 bit unit, so the result
+     * is really UCS-2 and code points above 0xFFFF are not represented correctly.
+     *
+     * @param string $str
+     * @param bool $bom prepend the big endian byte order mark
+     * @return string
+     */
+    public static function toUtf16be($str, $bom = false)
+    {
+        $out = $bom ? "\xFE\xFF" : '';
+        if (UTF8_MBSTRING) {
+            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
+        }
+
+        $uni = Unicode::fromUtf8($str);
+        foreach ($uni as $cp) {
+            $out .= pack('n', $cp);
+        }
+        return $out;
+    }
+
+    /**
+     * UTF-16BE to UTF-8 conversion.
+     *
+     * Every 16 bit unit is treated as a code point of its own (really UCS-2), surrogate
+     * pairs are not combined.
+     *
+     * @param string $str
+     * @return false|string
+     */
+    public static function fromUtf16be($str)
+    {
+        $uni = unpack('n*', $str);
+        return Unicode::toUtf8($uni);
+    }
+
+}
diff --git a/platform/www/inc/Utf8/PhpString.php b/platform/www/inc/Utf8/PhpString.php
new file mode 100644
index 0000000..5bcd601
--- /dev/null
+++ b/platform/www/inc/Utf8/PhpString.php
@@ -0,0 +1,383 @@
+<?php
+
+namespace dokuwiki\Utf8;
+
+/**
+ * UTF-8 aware equivalents to PHP's string functions
+ */
+class PhpString
+{
+
+    /**
+     * A locale independent basename() implementation
+     *
+     * works around a bug in PHP's basename() implementation
+     *
+     * @param string $path A path
+     * @param string $suffix If the name component ends in suffix this will also be cut off
+     * @return string
+     * @link https://bugs.php.net/bug.php?id=37738
+     *
+     * @see basename()
+     */
+    public static function basename($path, $suffix = '')
+    {
+        $path = trim($path, '\\/');
+        // position of the last separator of either kind (false when there is none)
+        $rpos = max(strrpos($path, '/'), strrpos($path, '\\'));
+        if ($rpos) {
+            $path = substr($path, $rpos + 1);
+        }
+
+        $suflen = strlen($suffix);
+        if ($suflen && (substr($path, -$suflen) === $suffix)) {
+            $path = substr($path, 0, -$suflen);
+        }
+
+        return $path;
+    }
+
+    /**
+     * Unicode aware replacement for strlen()
+     *
+     * The mbstring and iconv implementations are preferred. utf8_decode() is only used as a
+     * last resort because it is deprecated as of PHP 8.2; it converts characters that are not
+     * in ISO-8859-1 to '?', which, for the purpose of counting, is alright.
+     *
+     * @param string $string
+     * @return int
+     * @see utf8_decode()
+     *
+     * @author <chernyshevsky at hotmail dot com>
+     * @see strlen()
+     */
+    public static function strlen($string)
+    {
+        if (UTF8_MBSTRING) {
+            return mb_strlen($string, 'UTF-8');
+        }
+
+        if (function_exists('iconv_strlen')) {
+            return iconv_strlen($string, 'UTF-8');
+        }
+
+        // deprecated since PHP 8.2, hence tried last
+        if (function_exists('utf8_decode')) {
+            return strlen(utf8_decode($string));
+        }
+
+        return strlen($string);
+    }
+
+    /**
+     * UTF-8 aware alternative to substr
+     *
+     * Return part of a string given character offset (and optionally length)
+     *
+     * @param string $str
+     * @param int $offset number of UTF-8 characters offset (from left)
+     * @param int $length (optional) length in UTF-8 characters from offset
+     * @return string
+     * @author Harry Fuecks <hfuecks@gmail.com>
+     * @author Chris Smith <chris@jalakai.co.uk>
+     *
+     */
+    public static function substr($str, $offset, $length = null)
+    {
+        if (UTF8_MBSTRING) {
+            if ($length === null) {
+                return mb_substr($str, $offset);
+            }
+
+            return mb_substr($str, $offset, $length);
+        }
+
+        /*
+         * Notes:
+         *
+         * no mb string support, so we'll use pcre regex's with 'u' flag
+         * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
+         * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
+         *
+         * substr documentation states false can be returned in some cases (e.g. offset > string length)
+         * mb_substr never returns false, it will return an empty string instead.
+         *
+         * calculating the number of characters in the string is a relatively expensive operation, so
+         * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
+         */
+
+        // cast parameters to appropriate types to avoid multiple notices/warnings
+        $str = (string)$str; // generates E_NOTICE for PHP4 objects, but not PHP5 objects
+        $offset = (int)$offset;
+        if ($length !== null) $length = (int)$length;
+
+        // handle trivial cases
+        if ($length === 0) return '';
+        if ($offset < 0 && $length < 0 && $length < $offset) return '';
+
+        $offset_pattern = '';
+        $length_pattern = '';
+
+        // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
+        if ($offset < 0) {
+            $strlen = self::strlen($str); // see notes
+            $offset = $strlen + $offset;
+            if ($offset < 0) $offset = 0;
+        }
+
+        // establish a pattern for offset, a non-captured group equal in length to offset
+        if ($offset > 0) {
+            $Ox = (int)($offset / 65535);
+            $Oy = $offset % 65535;
+
+            if ($Ox) $offset_pattern = '(?:.{65535}){' . $Ox . '}';
+            $offset_pattern = '^(?:' . $offset_pattern . '.{' . $Oy . '})';
+        } else {
+            $offset_pattern = '^'; // offset == 0; just anchor the pattern
+        }
+
+        // establish a pattern for length
+        if ($length === null) {
+            $length_pattern = '(.*)$'; // the rest of the string
+        } else {
+
+            if (!isset($strlen)) $strlen = self::strlen($str); // see notes
+            if ($offset > $strlen) return ''; // another trivial case
+
+            if ($length > 0) {
+
+                // reduce any length that would go past the end of the string
+                $length = min($strlen - $offset, $length);
+
+                $Lx = (int)($length / 65535);
+                $Ly = $length % 65535;
+
+                // +ve length requires ... a captured group of length characters
+                if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}';
+                $length_pattern = '(' . $length_pattern . '.{' . $Ly . '})';
+
+            } elseif ($length < 0) {
+
+                if ($length < ($offset - $strlen)) return '';
+
+                $Lx = (int)((-$length) / 65535);
+                $Ly = (-$length) % 65535;
+
+                // -ve length requires ... capture everything except a group of -length characters
+                // anchored at the tail-end of the string
+                if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}';
+                $length_pattern = '(.*)(?:' . $length_pattern . '.{' . $Ly . '})$';
+            }
+        }
+
+        if (!preg_match('#' . $offset_pattern . $length_pattern . '#us', $str, $match)) return '';
+        return $match[1];
+    }
+
+    // phpcs:disable PSR1.Methods.CamelCapsMethodName.NotCamelCaps
+    /**
+     * Unicode aware replacement for substr_replace()
+     *
+     * Note that, unlike PHP's substr_replace(), $length defaults to 0 here, i.e. the
+     * replacement is inserted at $start when no length is given.
+     *
+     * @param string $string input string
+     * @param string $replacement the replacement
+     * @param int $start the replacing will begin at the start'th offset into string.
+     * @param int $length If given and is positive, it represents the length of the portion of string which is
+     *                    to be replaced. If length is zero then this function will have the effect of inserting
+     *                    replacement into string at the given start offset.
+     * @return string
+     * @see substr_replace()
+     *
+     * @author Andreas Gohr <andi@splitbrain.org>
+     */
+    public static function substr_replace($string, $replacement, $start, $length = 0)
+    {
+        $ret = '';
+        if ($start > 0) $ret .= self::substr($string, 0, $start);
+        $ret .= $replacement;
+        $ret .= self::substr($string, $start + $length);
+        return $ret;
+    }
+    // phpcs:enable PSR1.Methods.CamelCapsMethodName.NotCamelCaps
+
+    /**
+     * Unicode aware replacement for ltrim()
+     *
+     * Every character of $charlist is taken literally; ranges are not supported.
+     *
+     * @param string $str
+     * @param string $charlist
+     * @return string
+     * @see ltrim()
+     *
+     * @author Andreas Gohr <andi@splitbrain.org>
+     */
+    public static function ltrim($str, $charlist = '')
+    {
+        if ($charlist === '') return ltrim($str);
+
+        // quote charlist for use in a character class; preg_quote() also covers a leading '^',
+        // which would otherwise negate the class
+        $charlist = preg_quote($charlist, '/');
+
+        return preg_replace('/^[' . $charlist . ']+/u', '', $str);
+    }
+
+    /**
+     * Unicode aware replacement for rtrim()
+     *
+     * Every character of $charlist is taken literally; ranges are not supported.
+     *
+     * @param string $str
+     * @param string $charlist
+     * @return string
+     * @see rtrim()
+     *
+     * @author Andreas Gohr <andi@splitbrain.org>
+     */
+    public static function rtrim($str, $charlist = '')
+    {
+        if ($charlist === '') return rtrim($str);
+
+        // quote charlist for use in a character class; preg_quote() also covers a leading '^',
+        // which would otherwise negate the class
+        $charlist = preg_quote($charlist, '/');
+
+        return preg_replace('/[' . $charlist . ']+$/u', '', $str);
+    }
+
+    /**
+     * Unicode aware replacement for trim()
+     *
+     * @param string $str
+     * @param string $charlist
+     * @return string
+     * @see trim()
+     *
+     * @author Andreas Gohr <andi@splitbrain.org>
+     */
+    public static function trim($str, $charlist = '')
+    {
+        if ($charlist === '') return trim($str);
+
+        return self::ltrim(self::rtrim($str, $charlist), $charlist);
+    }
+
+    /**
+     * This is a unicode aware replacement for strtolower()
+     *
+     * Uses mb_string extension if available; the result is additionally passed through
+     * Normalizer::normalize() when the Normalizer class is already loaded.
+     *
+     * @param string $string
+     * @return string
+     * @see \dokuwiki\Utf8\PhpString::strtoupper()
+     *
+     * @author Leo Feyer <leo@typolight.org>
+     * @see strtolower()
+     */
+    public static function strtolower($string)
+    {
+        if (UTF8_MBSTRING) {
+            // second argument false: do not trigger autoloading for the check
+            if (class_exists('Normalizer', false)) {
+                return \Normalizer::normalize(mb_strtolower($string, 'utf-8'));
+            }
+            return (mb_strtolower($string, 'utf-8'));
+        }
+        return strtr($string, Table::upperCaseToLowerCase());
+    }
+
+    /**
+     * This is a unicode aware replacement for strtoupper()
+     *
+     * Uses mb_string extension if available
+     *
+     * @param string $string
+     * @return string
+     * @see \dokuwiki\Utf8\PhpString::strtolower()
+     *
+     * @author Leo Feyer <leo@typolight.org>
+     * @see strtoupper()
+     */
+    public static function strtoupper($string)
+    {
+        if (UTF8_MBSTRING) return mb_strtoupper($string, 'utf-8');
+
+        return strtr($string, Table::lowerCaseToUpperCase());
+    }
+
+
+    /**
+     * UTF-8 aware alternative to ucfirst
+     * Make a string's first character uppercase
+     *
+     * @param string $str
+     * @return string with first character as upper case (if applicable)
+     * @author Harry Fuecks
+     *
+     */
+    public static function ucfirst($str)
+    {
+        switch (self::strlen($str)) {
+            case 0:
+                return '';
+            case 1:
+                return self::strtoupper($str);
+            default:
+                preg_match('/^(.{1})(.*)$/us', $str, $matches);
+                return self::strtoupper($matches[1]) . $matches[2];
+        }
+    }
+
+    /**
+     * UTF-8 aware alternative to ucwords
+     * Uppercase the first character of each word in a string
+     *
+     * @param string $str
+     * @return string with first char of each word uppercase
+     * @author Harry Fuecks
+     * @see http://php.net/ucwords
+     *
+     */
+    public static function ucwords($str)
+    {
+        // Note: [\x0c\x09\x0b\x0a\x0d\x20] matches;
+        // form feeds, horizontal tabs, vertical tabs, linefeeds and carriage returns
+        // This corresponds to the definition of a "word" defined at http://php.net/ucwords
+        $pattern = '/(^|([\x0c\x09\x0b\x0a\x0d\x20]+))([^\x0c\x09\x0b\x0a\x0d\x20]{1})[^\x0c\x09\x0b\x0a\x0d\x20]*/u';
+
+        return preg_replace_callback(
+            $pattern,
+            static function ($matches) {
+                $leadingws = $matches[2];
+                $firstchar = $matches[3];
+
+                // Cut the word out by the exact byte length of the captured whitespace. Native
+                // ltrim() must not be used for this: it does not strip form feeds and it does
+                // strip NUL bytes, so it disagrees with the whitespace set of the pattern.
+                $word = substr($matches[0], strlen($leadingws));
+                $rest = substr($word, strlen($firstchar));
+
+                return $leadingws . self::strtoupper($firstchar) . $rest;
+            },
+            $str
+        );
+    }
+
+    /**
+     * This is an Unicode aware replacement for strpos
+     *
+     * @param string $haystack
+     * @param string $needle
+     * @param integer $offset offset in characters
+     * @return integer|false character position of the needle, false when it was not found
+     * @author Leo Feyer <leo@typolight.org>
+     * @see strpos()
+     *
+     */
+    public static function strpos($haystack, $needle, $offset = 0)
+    {
+        $comp = 0;
+        $length = null;
+
+        // search byte-wise, then convert the byte position found into a character position;
+        // repeat with a compensated byte offset while the hit lies before the character offset
+        while ($length === null || $length < $offset) {
+            $pos = strpos($haystack, $needle, $offset + $comp);
+
+            if ($pos === false)
+                return false;
+
+            $length = self::strlen(substr($haystack, 0, $pos));
+
+            if ($length < $offset)
+                $comp = $pos - $length;
+        }
+
+        return $length;
+    }
+
+
+}
diff --git a/platform/www/inc/Utf8/Table.php b/platform/www/inc/Utf8/Table.php
new file mode 100644
index 0000000..8683c92
--- /dev/null
+++ b/platform/www/inc/Utf8/Table.php
@@ -0,0 +1,93 @@
+<?php
+
+namespace dokuwiki\Utf8;
+
+/**
+ * Static accessors for the UTF-8 conversion tables
+ *
+ * Each table is loaded lazily the first time it is requested and kept for later calls
+ */
+class Table
+{
+
+    /**
+     * Upper case to lower case conversion table
+     *
+     * @return array
+     */
+    public static function upperCaseToLowerCase()
+    {
+        static $table = null;
+        if ($table !== null) {
+            return $table;
+        }
+
+        $table = include __DIR__ . '/tables/case.php';
+        return $table;
+    }
+
+    /**
+     * Lower case to upper case conversion table
+     *
+     * Derived by flipping the upper to lower case table
+     *
+     * @return array
+     */
+    public static function lowerCaseToUpperCase()
+    {
+        static $table = null;
+        if ($table !== null) {
+            return $table;
+        }
+
+        $table = array_flip(self::upperCaseToLowerCase());
+        return $table;
+    }
+
+    /**
+     * Lower case accent table
+     *
+     * @return array
+     */
+    public static function lowerAccents()
+    {
+        static $table = null;
+        if ($table !== null) {
+            return $table;
+        }
+
+        $table = include __DIR__ . '/tables/loweraccents.php';
+        return $table;
+    }
+
+    /**
+     * Upper case accent table
+     *
+     * @return array
+     */
+    public static function upperAccents()
+    {
+        static $table = null;
+        if ($table !== null) {
+            return $table;
+        }
+
+        $table = include __DIR__ . '/tables/upperaccents.php';
+        return $table;
+    }
+
+    /**
+     * Romanization table
+     *
+     * @return array
+     */
+    public static function romanization()
+    {
+        static $table = null;
+        if ($table !== null) {
+            return $table;
+        }
+
+        $table = include __DIR__ . '/tables/romanization.php';
+        return $table;
+    }
+
+    /**
+     * The special chars, concatenated into one UTF-8 string
+     *
+     * @return string
+     */
+    public static function specialChars()
+    {
+        static $string = null;
+        if ($string !== null) {
+            return $string;
+        }
+
+        // FIXME should we cache this to file system?
+        $codepoints = include __DIR__ . '/tables/specials.php';
+        $string = Unicode::toUtf8($codepoints);
+        return $string;
+    }
+}
diff --git a/platform/www/inc/Utf8/Unicode.php b/platform/www/inc/Utf8/Unicode.php
new file mode 100644
index 0000000..4b64265
--- /dev/null
+++ b/platform/www/inc/Utf8/Unicode.php
@@ -0,0 +1,277 @@
+<?php
+
+namespace dokuwiki\Utf8;
+
+/**
+ * Convert between UTF-8 and a list of Unicode Code Points
+ */
+class Unicode
+{
+
+    /**
+     * Takes an UTF-8 string and returns an array of ints representing the
+     * Unicode characters. Astral planes are supported ie. the ints in the
+     * output can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
+     * are not allowed.
+     *
+     * If $strict is set to true the function returns false if the input
+     * string isn't a valid UTF-8 octet sequence and raises a PHP error at
+     * level E_USER_WARNING. Without $strict no such check aborts the conversion:
+     * illegal lead octets are skipped and illegal code points are still output.
+     *
+     * Note: this function has been modified slightly in this library to
+     * trigger errors on encountering bad bytes
+     *
+     * @author <hsivonen@iki.fi>
+     * @author Harry Fuecks <hfuecks@gmail.com>
+     * @see unicode_to_utf8
+     * @link http://hsivonen.iki.fi/php-utf8/
+     * @link http://sourceforge.net/projects/phputf8/
+     * @todo break into less complex chunks
+     * @todo use exceptions instead of user errors
+     *
+     * @param string $str UTF-8 encoded string
+     * @param boolean $strict Check for invalid sequences?
+     * @return mixed array of unicode code points or false if UTF-8 invalid
+     */
+    public static function fromUtf8($str, $strict = false)
+    {
+        $mState = 0; // cached expected number of octets after the current octet
+        // until the beginning of the next UTF8 character sequence
+        $mUcs4 = 0; // cached Unicode character
+        $mBytes = 1; // cached expected number of octets in the current sequence
+
+        $out = array();
+
+        $len = strlen($str);
+
+        for ($i = 0; $i < $len; $i++) {
+
+            $in = ord($str[$i]);
+
+            if ($mState === 0) {
+
+                // When mState is zero we expect either a US-ASCII character or a
+                // multi-octet sequence.
+                if (0 === (0x80 & $in)) {
+                    // US-ASCII, pass straight through.
+                    $out[] = $in;
+                    $mBytes = 1;
+
+                } elseif (0xC0 === (0xE0 & $in)) {
+                    // First octet of 2 octet sequence
+                    $mUcs4 = $in;
+                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
+                    $mState = 1;
+                    $mBytes = 2;
+
+                } elseif (0xE0 === (0xF0 & $in)) {
+                    // First octet of 3 octet sequence
+                    $mUcs4 = $in;
+                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
+                    $mState = 2;
+                    $mBytes = 3;
+
+                } elseif (0xF0 === (0xF8 & $in)) {
+                    // First octet of 4 octet sequence
+                    $mUcs4 = $in;
+                    $mUcs4 = ($mUcs4 & 0x07) << 18;
+                    $mState = 3;
+                    $mBytes = 4;
+
+                } elseif (0xF8 === (0xFC & $in)) {
+                    /* First octet of 5 octet sequence.
+                     *
+                     * This is illegal because the encoded codepoint must be either
+                     * (a) not the shortest form or
+                     * (b) outside the Unicode range of 0-0x10FFFF.
+                     * Rather than trying to resynchronize, we will carry on until the end
+                     * of the sequence and let the later error handling code catch it.
+                     */
+                    $mUcs4 = $in;
+                    $mUcs4 = ($mUcs4 & 0x03) << 24;
+                    $mState = 4;
+                    $mBytes = 5;
+
+                } elseif (0xFC === (0xFE & $in)) {
+                    // First octet of 6 octet sequence, see comments for 5 octet sequence.
+                    $mUcs4 = $in;
+                    $mUcs4 = ($mUcs4 & 1) << 30;
+                    $mState = 5;
+                    $mBytes = 6;
+
+                } elseif ($strict) {
+                    /* Current octet is neither in the US-ASCII range nor a legal first
+                     * octet of a multi-octet sequence.
+                     */
+                    trigger_error(
+                        'utf8_to_unicode: Illegal sequence identifier ' .
+                        'in UTF-8 at byte ' . $i,
+                        E_USER_WARNING
+                    );
+                    return false;
+
+                }
+
+            } else {
+
+                // When mState is non-zero, we expect a continuation of the multi-octet
+                // sequence
+                if (0x80 === (0xC0 & $in)) {
+
+                    // Legal continuation.
+                    $shift = ($mState - 1) * 6;
+                    $tmp = $in;
+                    $tmp = ($tmp & 0x0000003F) << $shift;
+                    $mUcs4 |= $tmp;
+
+                    /**
+                     * End of the multi-octet sequence. mUcs4 now contains the final
+                     * Unicode codepoint to be output
+                     */
+                    if (0 === --$mState) {
+
+                        /*
+                         * Check for illegal sequences and codepoints.
+                         */
+                        // From Unicode 3.1, non-shortest form is illegal
+                        if (((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
+                            ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
+                            ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
+                            (4 < $mBytes) ||
+                            // From Unicode 3.2, surrogate characters are illegal
+                            (($mUcs4 & 0xFFFFF800) === 0xD800) ||
+                            // Codepoints outside the Unicode range are illegal
+                            ($mUcs4 > 0x10FFFF)) {
+
+                            if ($strict) {
+                                trigger_error(
+                                    'utf8_to_unicode: Illegal sequence or codepoint ' .
+                                    'in UTF-8 at byte ' . $i,
+                                    E_USER_WARNING
+                                );
+
+                                return false;
+                            }
+
+                        }
+
+                        if (0xFEFF !== $mUcs4) {
+                            // BOM is legal but we don't want to output it
+                            $out[] = $mUcs4;
+                        }
+
+                        //initialize UTF8 cache
+                        $mState = 0;
+                        $mUcs4 = 0;
+                        $mBytes = 1;
+                    }
+
+                } elseif ($strict) {
+                    /**
+                     *((0xC0 & (*in) != 0x80) && (mState != 0))
+                     * Incomplete multi-octet sequence.
+                     */
+                    trigger_error(
+                        'utf8_to_unicode: Incomplete multi-octet ' .
+                        ' sequence in UTF-8 at byte ' . $i,
+                        E_USER_WARNING
+                    );
+
+                    return false;
+                }
+            }
+        }
+        return $out;
+    }
+
+    /**
+     * Takes an array of ints representing the Unicode characters and returns
+     * a UTF-8 string. Astral planes are supported ie. the ints in the
+     * input can be > 0xFFFF. Occurrences of the BOM are ignored. Surrogates
+     * are not allowed.
+     *
+     * If $strict is set to true the function returns false if the input
+     * array contains ints that represent surrogates or are outside the
+     * Unicode range and raises a PHP error at level E_USER_WARNING.
+     * Without $strict such values are silently skipped.
+     *
+     * The result is assembled in a plain string, so an early return in strict mode
+     * leaves no output buffer open.
+     *
+     * @param array $arr of unicode code points representing a string
+     * @param boolean $strict Check for invalid sequences?
+     * @return string|false UTF-8 string or false if array contains invalid code points;
+     *                      an empty string if $arr is not an array
+     *
+     * @author <hsivonen@iki.fi>
+     * @author Harry Fuecks <hfuecks@gmail.com>
+     * @see utf8_to_unicode
+     * @link http://hsivonen.iki.fi/php-utf8/
+     * @link http://sourceforge.net/projects/phputf8/
+     * @todo use exceptions instead of user errors
+     */
+    public static function toUtf8($arr, $strict = false)
+    {
+        if (!is_array($arr)) return '';
+
+        $utf8 = '';
+
+        foreach ($arr as $k => $cp) {
+
+            if ($cp < 0 || $cp > 0x10ffff) {
+                # outside the Unicode range (negative values included)
+
+                if ($strict) {
+                    trigger_error(
+                        'unicode_to_utf8: Codepoint out of Unicode range ' .
+                        'at index: ' . $k . ', value: ' . $cp,
+                        E_USER_WARNING
+                    );
+
+                    // out of range
+                    return false;
+                }
+
+            } elseif ($cp <= 0x007f) {
+                # ASCII range (including control chars)
+
+                $utf8 .= chr($cp);
+
+            } elseif ($cp <= 0x07ff) {
+                # 2 byte sequence
+
+                $utf8 .= chr(0xc0 | ($cp >> 6));
+                $utf8 .= chr(0x80 | ($cp & 0x003f));
+
+            } elseif ($cp == 0xFEFF) {
+                # Byte order mark (skip)
+                // nop -- zap the BOM
+
+            } elseif ($cp >= 0xD800 && $cp <= 0xDFFF) {
+                # Test for illegal surrogates
+
+                // found a surrogate
+                if ($strict) {
+                    trigger_error(
+                        'unicode_to_utf8: Illegal surrogate ' .
+                        'at index: ' . $k . ', value: ' . $cp,
+                        E_USER_WARNING
+                    );
+                    return false;
+                }
+
+            } elseif ($cp <= 0xffff) {
+                # 3 byte sequence
+
+                $utf8 .= chr(0xe0 | ($cp >> 12));
+                $utf8 .= chr(0x80 | (($cp >> 6) & 0x003f));
+                $utf8 .= chr(0x80 | ($cp & 0x003f));
+
+            } else {
+                # 4 byte sequence (everything up to 0x10ffff is left at this point)
+
+                $utf8 .= chr(0xf0 | ($cp >> 18));
+                $utf8 .= chr(0x80 | (($cp >> 12) & 0x3f));
+                $utf8 .= chr(0x80 | (($cp >> 6) & 0x3f));
+                $utf8 .= chr(0x80 | ($cp & 0x3f));
+            }
+        }
+
+        return $utf8;
+    }
+}
diff --git a/platform/www/inc/Utf8/tables/case.php b/platform/www/inc/Utf8/tables/case.php
new file mode 100644
index 0000000..6c41b58
--- /dev/null
+++ b/platform/www/inc/Utf8/tables/case.php
@@ -0,0 +1,659 @@
+<?php
+/**
+ * UTF-8 Case lookup table
+ *
+ * This lookup table maps each upper case letter to its corresponding
+ * lower case letter in UTF-8
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
+ */
+return [ // key: upper case letter, value: the lower case letter it maps to
+ 'A' => 'a', // Basic Latin
+ 'B' => 'b',
+ 'C' => 'c',
+ 'D' => 'd',
+ 'E' => 'e',
+ 'F' => 'f',
+ 'G' => 'g',
+ 'H' => 'h',
+ 'I' => 'i',
+ 'J' => 'j',
+ 'K' => 'k',
+ 'L' => 'l',
+ 'M' => 'm',
+ 'N' => 'n',
+ 'O' => 'o',
+ 'P' => 'p',
+ 'Q' => 'q',
+ 'R' => 'r',
+ 'S' => 's',
+ 'T' => 't',
+ 'U' => 'u',
+ 'V' => 'v',
+ 'W' => 'w',
+ 'X' => 'x',
+ 'Y' => 'y',
+ 'Z' => 'z',
+ 'À' => 'à', // Latin-1 Supplement
+ 'Á' => 'á',
+ 'Â' => 'â',
+ 'Ã' => 'ã',
+ 'Ä' => 'ä',
+ 'Å' => 'å',
+ 'Æ' => 'æ',
+ 'Ç' => 'ç',
+ 'È' => 'è',
+ 'É' => 'é',
+ 'Ê' => 'ê',
+ 'Ë' => 'ë',
+ 'Ì' => 'ì',
+ 'Í' => 'í',
+ 'Î' => 'î',
+ 'Ï' => 'ï',
+ 'Ð' => 'ð',
+ 'Ñ' => 'ñ',
+ 'Ò' => 'ò',
+ 'Ó' => 'ó',
+ 'Ô' => 'ô',
+ 'Õ' => 'õ',
+ 'Ö' => 'ö',
+ 'Ø' => 'ø',
+ 'Ù' => 'ù',
+ 'Ú' => 'ú',
+ 'Û' => 'û',
+ 'Ü' => 'ü',
+ 'Ý' => 'ý',
+ 'Þ' => 'þ',
+ 'Ā' => 'ā', // Latin Extended-A
+ 'Ă' => 'ă',
+ 'Ą' => 'ą',
+ 'Ć' => 'ć',
+ 'Ĉ' => 'ĉ',
+ 'Ċ' => 'ċ',
+ 'Č' => 'č',
+ 'Ď' => 'ď',
+ 'Đ' => 'đ',
+ 'Ē' => 'ē',
+ 'Ĕ' => 'ĕ',
+ 'Ė' => 'ė',
+ 'Ę' => 'ę',
+ 'Ě' => 'ě',
+ 'Ĝ' => 'ĝ',
+ 'Ğ' => 'ğ',
+ 'Ġ' => 'ġ',
+ 'Ģ' => 'ģ',
+ 'Ĥ' => 'ĥ',
+ 'Ħ' => 'ħ',
+ 'Ĩ' => 'ĩ',
+ 'Ī' => 'ī',
+ 'Ĭ' => 'ĭ',
+ 'Į' => 'į',
+ 'IJ' => 'ij', // NOTE(review): presumably the single IJ ligature letter - verify the file encoding
+ 'Ĵ' => 'ĵ',
+ 'Ķ' => 'ķ',
+ 'Ĺ' => 'ĺ',
+ 'Ļ' => 'ļ',
+ 'Ľ' => 'ľ',
+ 'Ŀ' => 'ŀ',
+ 'Ł' => 'ł',
+ 'Ń' => 'ń',
+ 'Ņ' => 'ņ',
+ 'Ň' => 'ň',
+ 'Ŋ' => 'ŋ',
+ 'Ō' => 'ō',
+ 'Ŏ' => 'ŏ',
+ 'Ő' => 'ő',
+ 'Œ' => 'œ',
+ 'Ŕ' => 'ŕ',
+ 'Ŗ' => 'ŗ',
+ 'Ř' => 'ř',
+ 'Ś' => 'ś',
+ 'Ŝ' => 'ŝ',
+ 'Ş' => 'ş',
+ 'Š' => 'š',
+ 'Ţ' => 'ţ',
+ 'Ť' => 'ť',
+ 'Ŧ' => 'ŧ',
+ 'Ũ' => 'ũ',
+ 'Ū' => 'ū',
+ 'Ŭ' => 'ŭ',
+ 'Ů' => 'ů',
+ 'Ű' => 'ű',
+ 'Ų' => 'ų',
+ 'Ŵ' => 'ŵ',
+ 'Ŷ' => 'ŷ',
+ 'Ÿ' => 'ÿ',
+ 'Ź' => 'ź',
+ 'Ż' => 'ż',
+ 'Ž' => 'ž',
+ 'Ɓ' => 'ɓ', // Latin Extended-B
+ 'Ƃ' => 'ƃ',
+ 'Ƅ' => 'ƅ',
+ 'Ɔ' => 'ɔ',
+ 'Ƈ' => 'ƈ',
+ 'Ɖ' => 'ɖ',
+ 'Ɗ' => 'ɗ',
+ 'Ƌ' => 'ƌ',
+ 'Ǝ' => 'ǝ',
+ 'Ə' => 'ə',
+ 'Ɛ' => 'ɛ',
+ 'Ƒ' => 'ƒ',
+ 'Ɣ' => 'ɣ',
+ 'Ɩ' => 'ɩ',
+ 'Ɨ' => 'ɨ',
+ 'Ƙ' => 'ƙ',
+ 'Ɯ' => 'ɯ',
+ 'Ɲ' => 'ɲ',
+ 'Ɵ' => 'ɵ',
+ 'Ơ' => 'ơ',
+ 'Ƣ' => 'ƣ',
+ 'Ƥ' => 'ƥ',
+ 'Ʀ' => 'ʀ',
+ 'Ƨ' => 'ƨ',
+ 'Ʃ' => 'ʃ',
+ 'Ƭ' => 'ƭ',
+ 'Ʈ' => 'ʈ',
+ 'Ư' => 'ư',
+ 'Ʊ' => 'ʊ',
+ 'Ʋ' => 'ʋ',
+ 'Ƴ' => 'ƴ',
+ 'Ƶ' => 'ƶ',
+ 'Ʒ' => 'ʒ',
+ 'Ƹ' => 'ƹ',
+ 'Ƽ' => 'ƽ',
+ 'Dž' => 'dž', // NOTE(review): this and the Lj/Nj/Dz entries are presumably single digraph letters - verify the file encoding
+ 'Lj' => 'lj',
+ 'Nj' => 'nj',
+ 'Ǎ' => 'ǎ',
+ 'Ǐ' => 'ǐ',
+ 'Ǒ' => 'ǒ',
+ 'Ǔ' => 'ǔ',
+ 'Ǖ' => 'ǖ',
+ 'Ǘ' => 'ǘ',
+ 'Ǚ' => 'ǚ',
+ 'Ǜ' => 'ǜ',
+ 'Ǟ' => 'ǟ',
+ 'Ǡ' => 'ǡ',
+ 'Ǣ' => 'ǣ',
+ 'Ǥ' => 'ǥ',
+ 'Ǧ' => 'ǧ',
+ 'Ǩ' => 'ǩ',
+ 'Ǫ' => 'ǫ',
+ 'Ǭ' => 'ǭ',
+ 'Ǯ' => 'ǯ',
+ 'Dz' => 'dz',
+ 'Ǵ' => 'ǵ',
+ 'Ƕ' => 'ƕ',
+ 'Ƿ' => 'ƿ',
+ 'Ǹ' => 'ǹ',
+ 'Ǻ' => 'ǻ',
+ 'Ǽ' => 'ǽ',
+ 'Ǿ' => 'ǿ',
+ 'Ȁ' => 'ȁ',
+ 'Ȃ' => 'ȃ',
+ 'Ȅ' => 'ȅ',
+ 'Ȇ' => 'ȇ',
+ 'Ȉ' => 'ȉ',
+ 'Ȋ' => 'ȋ',
+ 'Ȍ' => 'ȍ',
+ 'Ȏ' => 'ȏ',
+ 'Ȑ' => 'ȑ',
+ 'Ȓ' => 'ȓ',
+ 'Ȕ' => 'ȕ',
+ 'Ȗ' => 'ȗ',
+ 'Ș' => 'ș',
+ 'Ț' => 'ț',
+ 'Ȝ' => 'ȝ',
+ 'Ȟ' => 'ȟ',
+ 'Ƞ' => 'ƞ',
+ 'Ȣ' => 'ȣ',
+ 'Ȥ' => 'ȥ',
+ 'Ȧ' => 'ȧ',
+ 'Ȩ' => 'ȩ',
+ 'Ȫ' => 'ȫ',
+ 'Ȭ' => 'ȭ',
+ 'Ȯ' => 'ȯ',
+ 'Ȱ' => 'ȱ',
+ 'Ȳ' => 'ȳ',
+ 'Ά' => 'ά', // Greek
+ 'Έ' => 'έ',
+ 'Ή' => 'ή',
+ 'Ί' => 'ί',
+ 'Ό' => 'ό',
+ 'Ύ' => 'ύ',
+ 'Ώ' => 'ώ',
+ 'Α' => 'α',
+ 'Β' => 'β',
+ 'Γ' => 'γ',
+ 'Δ' => 'δ',
+ 'Ε' => 'ε',
+ 'Ζ' => 'ζ',
+ 'Η' => 'η',
+ 'Θ' => 'θ',
+ 'Ι' => 'ι',
+ 'Κ' => 'κ',
+ 'Λ' => 'λ',
+ 'Μ' => 'μ',
+ 'Ν' => 'ν',
+ 'Ξ' => 'ξ',
+ 'Ο' => 'ο',
+ 'Π' => 'π',
+ 'Ρ' => 'ρ',
+ 'Σ' => 'σ',
+ 'Τ' => 'τ',
+ 'Υ' => 'υ',
+ 'Φ' => 'φ',
+ 'Χ' => 'χ',
+ 'Ψ' => 'ψ',
+ 'Ω' => 'ω',
+ 'Ϊ' => 'ϊ',
+ 'Ϋ' => 'ϋ',
+ 'Ϙ' => 'ϙ',
+ 'Ϛ' => 'ϛ',
+ 'Ϝ' => 'ϝ',
+ 'Ϟ' => 'ϟ',
+ 'Ϡ' => 'ϡ',
+ 'Ϣ' => 'ϣ',
+ 'Ϥ' => 'ϥ',
+ 'Ϧ' => 'ϧ',
+ 'Ϩ' => 'ϩ',
+ 'Ϫ' => 'ϫ',
+ 'Ϭ' => 'ϭ',
+ 'Ϯ' => 'ϯ',
+ 'Ѐ' => 'ѐ', // Cyrillic
+ 'Ё' => 'ё',
+ 'Ђ' => 'ђ',
+ 'Ѓ' => 'ѓ',
+ 'Є' => 'є',
+ 'Ѕ' => 'ѕ',
+ 'І' => 'і',
+ 'Ї' => 'ї',
+ 'Ј' => 'ј',
+ 'Љ' => 'љ',
+ 'Њ' => 'њ',
+ 'Ћ' => 'ћ',
+ 'Ќ' => 'ќ',
+ 'Ѝ' => 'ѝ',
+ 'Ў' => 'ў',
+ 'Џ' => 'џ',
+ 'А' => 'а',
+ 'Б' => 'б',
+ 'В' => 'в',
+ 'Г' => 'г',
+ 'Д' => 'д',
+ 'Е' => 'е',
+ 'Ж' => 'ж',
+ 'З' => 'з',
+ 'И' => 'и',
+ 'Й' => 'й',
+ 'К' => 'к',
+ 'Л' => 'л',
+ 'М' => 'м',
+ 'Н' => 'н',
+ 'О' => 'о',
+ 'П' => 'п',
+ 'Р' => 'р',
+ 'С' => 'с',
+ 'Т' => 'т',
+ 'У' => 'у',
+ 'Ф' => 'ф',
+ 'Х' => 'х',
+ 'Ц' => 'ц',
+ 'Ч' => 'ч',
+ 'Ш' => 'ш',
+ 'Щ' => 'щ',
+ 'Ъ' => 'ъ',
+ 'Ы' => 'ы',
+ 'Ь' => 'ь',
+ 'Э' => 'э',
+ 'Ю' => 'ю',
+ 'Я' => 'я',
+ 'Ѡ' => 'ѡ',
+ 'Ѣ' => 'ѣ',
+ 'Ѥ' => 'ѥ',
+ 'Ѧ' => 'ѧ',
+ 'Ѩ' => 'ѩ',
+ 'Ѫ' => 'ѫ',
+ 'Ѭ' => 'ѭ',
+ 'Ѯ' => 'ѯ',
+ 'Ѱ' => 'ѱ',
+ 'Ѳ' => 'ѳ',
+ 'Ѵ' => 'ѵ',
+ 'Ѷ' => 'ѷ',
+ 'Ѹ' => 'ѹ',
+ 'Ѻ' => 'ѻ',
+ 'Ѽ' => 'ѽ',
+ 'Ѿ' => 'ѿ',
+ 'Ҁ' => 'ҁ',
+ 'Ҋ' => 'ҋ',
+ 'Ҍ' => 'ҍ',
+ 'Ҏ' => 'ҏ',
+ 'Ґ' => 'ґ',
+ 'Ғ' => 'ғ',
+ 'Ҕ' => 'ҕ',
+ 'Җ' => 'җ',
+ 'Ҙ' => 'ҙ',
+ 'Қ' => 'қ',
+ 'Ҝ' => 'ҝ',
+ 'Ҟ' => 'ҟ',
+ 'Ҡ' => 'ҡ',
+ 'Ң' => 'ң',
+ 'Ҥ' => 'ҥ',
+ 'Ҧ' => 'ҧ',
+ 'Ҩ' => 'ҩ',
+ 'Ҫ' => 'ҫ',
+ 'Ҭ' => 'ҭ',
+ 'Ү' => 'ү',
+ 'Ұ' => 'ұ',
+ 'Ҳ' => 'ҳ',
+ 'Ҵ' => 'ҵ',
+ 'Ҷ' => 'ҷ',
+ 'Ҹ' => 'ҹ',
+ 'Һ' => 'һ',
+ 'Ҽ' => 'ҽ',
+ 'Ҿ' => 'ҿ',
+ 'Ӂ' => 'ӂ',
+ 'Ӄ' => 'ӄ',
+ 'Ӆ' => 'ӆ',
+ 'Ӈ' => 'ӈ',
+ 'Ӊ' => 'ӊ',
+ 'Ӌ' => 'ӌ',
+ 'Ӎ' => 'ӎ',
+ 'Ӑ' => 'ӑ',
+ 'Ӓ' => 'ӓ',
+ 'Ӕ' => 'ӕ',
+ 'Ӗ' => 'ӗ',
+ 'Ә' => 'ә',
+ 'Ӛ' => 'ӛ',
+ 'Ӝ' => 'ӝ',
+ 'Ӟ' => 'ӟ',
+ 'Ӡ' => 'ӡ',
+ 'Ӣ' => 'ӣ',
+ 'Ӥ' => 'ӥ',
+ 'Ӧ' => 'ӧ',
+ 'Ө' => 'ө',
+ 'Ӫ' => 'ӫ',
+ 'Ӭ' => 'ӭ',
+ 'Ӯ' => 'ӯ',
+ 'Ӱ' => 'ӱ',
+ 'Ӳ' => 'ӳ',
+ 'Ӵ' => 'ӵ',
+ 'Ӹ' => 'ӹ',
+ 'Ԁ' => 'ԁ', // Cyrillic Supplement
+ 'Ԃ' => 'ԃ',
+ 'Ԅ' => 'ԅ',
+ 'Ԇ' => 'ԇ',
+ 'Ԉ' => 'ԉ',
+ 'Ԋ' => 'ԋ',
+ 'Ԍ' => 'ԍ',
+ 'Ԏ' => 'ԏ',
+ 'Ա' => 'ա', // Armenian
+ 'Բ' => 'բ',
+ 'Գ' => 'գ',
+ 'Դ' => 'դ',
+ 'Ե' => 'ե',
+ 'Զ' => 'զ',
+ 'Է' => 'է',
+ 'Ը' => 'ը',
+ 'Թ' => 'թ',
+ 'Ժ' => 'ժ',
+ 'Ի' => 'ի',
+ 'Լ' => 'լ',
+ 'Խ' => 'խ',
+ 'Ծ' => 'ծ',
+ 'Կ' => 'կ',
+ 'Հ' => 'հ',
+ 'Ձ' => 'ձ',
+ 'Ղ' => 'ղ',
+ 'Ճ' => 'ճ',
+ 'Մ' => 'մ',
+ 'Յ' => 'յ',
+ 'Ն' => 'ն',
+ 'Շ' => 'շ',
+ 'Ո' => 'ո',
+ 'Չ' => 'չ',
+ 'Պ' => 'պ',
+ 'Ջ' => 'ջ',
+ 'Ռ' => 'ռ',
+ 'Ս' => 'ս',
+ 'Վ' => 'վ',
+ 'Տ' => 'տ',
+ 'Ր' => 'ր',
+ 'Ց' => 'ց',
+ 'Ւ' => 'ւ',
+ 'Փ' => 'փ',
+ 'Ք' => 'ք',
+ 'Օ' => 'օ',
+ 'Ֆ' => 'ֆ',
+ 'Ḁ' => 'ḁ', // Latin Extended Additional
+ 'Ḃ' => 'ḃ',
+ 'Ḅ' => 'ḅ',
+ 'Ḇ' => 'ḇ',
+ 'Ḉ' => 'ḉ',
+ 'Ḋ' => 'ḋ',
+ 'Ḍ' => 'ḍ',
+ 'Ḏ' => 'ḏ',
+ 'Ḑ' => 'ḑ',
+ 'Ḓ' => 'ḓ',
+ 'Ḕ' => 'ḕ',
+ 'Ḗ' => 'ḗ',
+ 'Ḙ' => 'ḙ',
+ 'Ḛ' => 'ḛ',
+ 'Ḝ' => 'ḝ',
+ 'Ḟ' => 'ḟ',
+ 'Ḡ' => 'ḡ',
+ 'Ḣ' => 'ḣ',
+ 'Ḥ' => 'ḥ',
+ 'Ḧ' => 'ḧ',
+ 'Ḩ' => 'ḩ',
+ 'Ḫ' => 'ḫ',
+ 'Ḭ' => 'ḭ',
+ 'Ḯ' => 'ḯ',
+ 'Ḱ' => 'ḱ',
+ 'Ḳ' => 'ḳ',
+ 'Ḵ' => 'ḵ',
+ 'Ḷ' => 'ḷ',
+ 'Ḹ' => 'ḹ',
+ 'Ḻ' => 'ḻ',
+ 'Ḽ' => 'ḽ',
+ 'Ḿ' => 'ḿ',
+ 'Ṁ' => 'ṁ',
+ 'Ṃ' => 'ṃ',
+ 'Ṅ' => 'ṅ',
+ 'Ṇ' => 'ṇ',
+ 'Ṉ' => 'ṉ',
+ 'Ṋ' => 'ṋ',
+ 'Ṍ' => 'ṍ',
+ 'Ṏ' => 'ṏ',
+ 'Ṑ' => 'ṑ',
+ 'Ṓ' => 'ṓ',
+ 'Ṕ' => 'ṕ',
+ 'Ṗ' => 'ṗ',
+ 'Ṙ' => 'ṙ',
+ 'Ṛ' => 'ṛ',
+ 'Ṝ' => 'ṝ',
+ 'Ṟ' => 'ṟ',
+ 'Ṡ' => 'ṡ',
+ 'Ṣ' => 'ṣ',
+ 'Ṥ' => 'ṥ',
+ 'Ṧ' => 'ṧ',
+ 'Ṩ' => 'ṩ',
+ 'Ṫ' => 'ṫ',
+ 'Ṭ' => 'ṭ',
+ 'Ṯ' => 'ṯ',
+ 'Ṱ' => 'ṱ',
+ 'Ṳ' => 'ṳ',
+ 'Ṵ' => 'ṵ',
+ 'Ṷ' => 'ṷ',
+ 'Ṹ' => 'ṹ',
+ 'Ṻ' => 'ṻ',
+ 'Ṽ' => 'ṽ',
+ 'Ṿ' => 'ṿ',
+ 'Ẁ' => 'ẁ',
+ 'Ẃ' => 'ẃ',
+ 'Ẅ' => 'ẅ',
+ 'Ẇ' => 'ẇ',
+ 'Ẉ' => 'ẉ',
+ 'Ẋ' => 'ẋ',
+ 'Ẍ' => 'ẍ',
+ 'Ẏ' => 'ẏ',
+ 'Ẑ' => 'ẑ',
+ 'Ẓ' => 'ẓ',
+ 'Ẕ' => 'ẕ',
+ 'Ạ' => 'ạ',
+ 'Ả' => 'ả',
+ 'Ấ' => 'ấ',
+ 'Ầ' => 'ầ',
+ 'Ẩ' => 'ẩ',
+ 'Ẫ' => 'ẫ',
+ 'Ậ' => 'ậ',
+ 'Ắ' => 'ắ',
+ 'Ằ' => 'ằ',
+ 'Ẳ' => 'ẳ',
+ 'Ẵ' => 'ẵ',
+ 'Ặ' => 'ặ',
+ 'Ẹ' => 'ẹ',
+ 'Ẻ' => 'ẻ',
+ 'Ẽ' => 'ẽ',
+ 'Ế' => 'ế',
+ 'Ề' => 'ề',
+ 'Ể' => 'ể',
+ 'Ễ' => 'ễ',
+ 'Ệ' => 'ệ',
+ 'Ỉ' => 'ỉ',
+ 'Ị' => 'ị',
+ 'Ọ' => 'ọ',
+ 'Ỏ' => 'ỏ',
+ 'Ố' => 'ố',
+ 'Ồ' => 'ồ',
+ 'Ổ' => 'ổ',
+ 'Ỗ' => 'ỗ',
+ 'Ộ' => 'ộ',
+ 'Ớ' => 'ớ',
+ 'Ờ' => 'ờ',
+ 'Ở' => 'ở',
+ 'Ỡ' => 'ỡ',
+ 'Ợ' => 'ợ',
+ 'Ụ' => 'ụ',
+ 'Ủ' => 'ủ',
+ 'Ứ' => 'ứ',
+ 'Ừ' => 'ừ',
+ 'Ử' => 'ử',
+ 'Ữ' => 'ữ',
+ 'Ự' => 'ự',
+ 'Ỳ' => 'ỳ',
+ 'Ỵ' => 'ỵ',
+ 'Ỷ' => 'ỷ',
+ 'Ỹ' => 'ỹ',
+ 'Ἀ' => 'ἀ', // Greek Extended
+ 'Ἁ' => 'ἁ',
+ 'Ἂ' => 'ἂ',
+ 'Ἃ' => 'ἃ',
+ 'Ἄ' => 'ἄ',
+ 'Ἅ' => 'ἅ',
+ 'Ἆ' => 'ἆ',
+ 'Ἇ' => 'ἇ',
+ 'Ἐ' => 'ἐ',
+ 'Ἑ' => 'ἑ',
+ 'Ἒ' => 'ἒ',
+ 'Ἓ' => 'ἓ',
+ 'Ἔ' => 'ἔ',
+ 'Ἕ' => 'ἕ',
+ 'Ἡ' => 'ἡ',
+ 'Ἢ' => 'ἢ',
+ 'Ἣ' => 'ἣ',
+ 'Ἤ' => 'ἤ',
+ 'Ἥ' => 'ἥ',
+ 'Ἦ' => 'ἦ',
+ 'Ἧ' => 'ἧ',
+ 'Ἰ' => 'ἰ',
+ 'Ἱ' => 'ἱ',
+ 'Ἲ' => 'ἲ',
+ 'Ἳ' => 'ἳ',
+ 'Ἴ' => 'ἴ',
+ 'Ἵ' => 'ἵ',
+ 'Ἶ' => 'ἶ',
+ 'Ἷ' => 'ἷ',
+ 'Ὀ' => 'ὀ',
+ 'Ὁ' => 'ὁ',
+ 'Ὂ' => 'ὂ',
+ 'Ὃ' => 'ὃ',
+ 'Ὄ' => 'ὄ',
+ 'Ὅ' => 'ὅ',
+ 'Ὑ' => 'ὑ',
+ 'Ὓ' => 'ὓ',
+ 'Ὕ' => 'ὕ',
+ 'Ὗ' => 'ὗ',
+ 'Ὡ' => 'ὡ',
+ 'Ὢ' => 'ὢ',
+ 'Ὣ' => 'ὣ',
+ 'Ὤ' => 'ὤ',
+ 'Ὥ' => 'ὥ',
+ 'Ὦ' => 'ὦ',
+ 'Ὧ' => 'ὧ',
+ 'ᾈ' => 'ᾀ',
+ 'ᾉ' => 'ᾁ',
+ 'ᾊ' => 'ᾂ',
+ 'ᾋ' => 'ᾃ',
+ 'ᾌ' => 'ᾄ',
+ 'ᾍ' => 'ᾅ',
+ 'ᾎ' => 'ᾆ',
+ 'ᾏ' => 'ᾇ',
+ 'ᾘ' => 'ᾐ',
+ 'ᾙ' => 'ᾑ',
+ 'ᾚ' => 'ᾒ',
+ 'ᾛ' => 'ᾓ',
+ 'ᾜ' => 'ᾔ',
+ 'ᾝ' => 'ᾕ',
+ 'ᾞ' => 'ᾖ',
+ 'ᾟ' => 'ᾗ',
+ 'ᾩ' => 'ᾡ',
+ 'ᾪ' => 'ᾢ',
+ 'ᾫ' => 'ᾣ',
+ 'ᾬ' => 'ᾤ',
+ 'ᾭ' => 'ᾥ',
+ 'ᾮ' => 'ᾦ',
+ 'ᾯ' => 'ᾧ',
+ 'Ᾰ' => 'ᾰ',
+ 'Ᾱ' => 'ᾱ',
+ 'Ὰ' => 'ὰ',
+ 'ᾼ' => 'ᾳ',
+ 'Ὲ' => 'ὲ',
+ 'Ὴ' => 'ὴ',
+ 'ῌ' => 'ῃ',
+ 'Ῐ' => 'ῐ',
+ 'Ῑ' => 'ῑ',
+ 'Ὶ' => 'ὶ',
+ 'Ῡ' => 'ῡ',
+ 'Ὺ' => 'ὺ',
+ 'Ῥ' => 'ῥ',
+ 'Ὸ' => 'ὸ',
+ 'Ὼ' => 'ὼ',
+ 'ῼ' => 'ῳ',
+ 'A' => 'a', // NOTE(review): as shown, these 26 entries repeat the A-Z keys from the top with the same values (no effect); presumably meant to be fullwidth forms - verify the file encoding
+ 'B' => 'b',
+ 'C' => 'c',
+ 'D' => 'd',
+ 'E' => 'e',
+ 'F' => 'f',
+ 'G' => 'g',
+ 'H' => 'h',
+ 'I' => 'i',
+ 'J' => 'j',
+ 'K' => 'k',
+ 'L' => 'l',
+ 'M' => 'm',
+ 'N' => 'n',
+ 'O' => 'o',
+ 'P' => 'p',
+ 'Q' => 'q',
+ 'R' => 'r',
+ 'S' => 's',
+ 'T' => 't',
+ 'U' => 'u',
+ 'V' => 'v',
+ 'W' => 'w',
+ 'X' => 'x',
+ 'Y' => 'y',
+ 'Z' => 'z',
+];
diff --git a/platform/www/inc/Utf8/tables/loweraccents.php b/platform/www/inc/Utf8/tables/loweraccents.php
new file mode 100644
index 0000000..cc3ec8e
--- /dev/null
+++ b/platform/www/inc/Utf8/tables/loweraccents.php
@@ -0,0 +1,116 @@
+<?php
+/**
+ * UTF-8 lookup table for lower case accented letters
+ *
+ * This lookup table defines replacements from the ASCII-7 range for
+ * accented characters. These are lower case letters only.
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
+ * @see \dokuwiki\Utf8\Clean::deaccent()
+ */
+return [ // key: accented lower case letter, value: its ASCII replacement (one or two letters)
+ 'á' => 'a',
+ 'à' => 'a',
+ 'ă' => 'a',
+ 'â' => 'a',
+ 'å' => 'a',
+ 'ä' => 'ae', // expands to two letters, unlike the other a variants
+ 'ã' => 'a',
+ 'ą' => 'a',
+ 'ā' => 'a',
+ 'æ' => 'ae', // ligature is spelled out
+ 'ḃ' => 'b',
+ 'ć' => 'c',
+ 'ĉ' => 'c',
+ 'č' => 'c',
+ 'ċ' => 'c',
+ 'ç' => 'c',
+ 'ď' => 'd',
+ 'ḋ' => 'd',
+ 'đ' => 'd',
+ 'ð' => 'dh', // eth becomes a two-letter replacement
+ 'é' => 'e',
+ 'è' => 'e',
+ 'ĕ' => 'e',
+ 'ê' => 'e',
+ 'ě' => 'e',
+ 'ë' => 'e',
+ 'ė' => 'e',
+ 'ę' => 'e',
+ 'ē' => 'e',
+ 'ḟ' => 'f',
+ 'ƒ' => 'f',
+ 'ğ' => 'g',
+ 'ĝ' => 'g',
+ 'ġ' => 'g',
+ 'ģ' => 'g',
+ 'ĥ' => 'h',
+ 'ħ' => 'h',
+ 'í' => 'i',
+ 'ì' => 'i',
+ 'î' => 'i',
+ 'ï' => 'i',
+ 'ĩ' => 'i',
+ 'į' => 'i',
+ 'ī' => 'i',
+ 'ĵ' => 'j',
+ 'ķ' => 'k',
+ 'ĺ' => 'l',
+ 'ľ' => 'l',
+ 'ļ' => 'l',
+ 'ł' => 'l',
+ 'ṁ' => 'm',
+ 'ń' => 'n',
+ 'ň' => 'n',
+ 'ñ' => 'n',
+ 'ņ' => 'n',
+ 'ó' => 'o',
+ 'ò' => 'o',
+ 'ô' => 'o',
+ 'ö' => 'oe', // expands to two letters, unlike the other o variants
+ 'ő' => 'o',
+ 'õ' => 'o',
+ 'ø' => 'o',
+ 'ō' => 'o',
+ 'ơ' => 'o',
+ 'ṗ' => 'p',
+ 'ŕ' => 'r',
+ 'ř' => 'r',
+ 'ŗ' => 'r',
+ 'ś' => 's',
+ 'ŝ' => 's',
+ 'š' => 's',
+ 'ṡ' => 's',
+ 'ş' => 's',
+ 'ș' => 's',
+ 'ß' => 'ss', // sharp s becomes a double s
+ 'ť' => 't',
+ 'ṫ' => 't',
+ 'ţ' => 't',
+ 'ț' => 't',
+ 'ŧ' => 't',
+ 'ú' => 'u',
+ 'ù' => 'u',
+ 'ŭ' => 'u',
+ 'û' => 'u',
+ 'ů' => 'u',
+ 'ü' => 'ue', // expands to two letters, unlike the other u variants
+ 'ű' => 'u',
+ 'ũ' => 'u',
+ 'ų' => 'u',
+ 'ū' => 'u',
+ 'ư' => 'u',
+ 'ẃ' => 'w',
+ 'ẁ' => 'w',
+ 'ŵ' => 'w',
+ 'ẅ' => 'w',
+ 'ý' => 'y',
+ 'ỳ' => 'y',
+ 'ŷ' => 'y',
+ 'ÿ' => 'y',
+ 'ź' => 'z',
+ 'ž' => 'z',
+ 'ż' => 'z',
+ 'þ' => 'th', // thorn becomes a two-letter replacement
+ 'µ' => 'u',
+];
diff --git a/platform/www/inc/Utf8/tables/romanization.php b/platform/www/inc/Utf8/tables/romanization.php
new file mode 100644
index 0000000..e757b9c
--- /dev/null
+++ b/platform/www/inc/Utf8/tables/romanization.php
@@ -0,0 +1,1458 @@
+<?php
+/**
+ * Romanization lookup table
+ *
+ * This lookup table provides a way to transform strings written in a language
+ * that is not based on Latin letters into plain ASCII.
+ *
+ * Please note: this is not a scientific transliteration table. It only works
+ * one way, from non-Latin to ASCII, and it works by simple character replacement
+ * only. Language-specific rules are not supported.
+ *
+ * @todo some keys are used multiple times
+ * @todo remove or integrate commented pairs
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
+ * @author Vitaly Blokhin <vitinfo@vitn.com>
+ * @author Bisqwit <bisqwit@iki.fi>
+ * @author Arthit Suriyawongkul <arthit@gmail.com>
+ * @author Denis Scheither <amorphis@uni-bremen.de>
+ * @author Eivind Morland <eivind.morland@gmail.com>
+ * @link http://www.uconv.com/translit.htm
+ * @link http://kanjidict.stc.cx/hiragana.php?src=2
+ * @link http://www.translatum.gr/converter/greek-transliteration.htm
+ * @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
+ * @link http://www.btranslations.com/resources/romanization/korean.asp
+ */
+return [
+ // scandinavian - differs from what we do in deaccent
+ 'å' => 'a',
+ 'Å' => 'A',
+ 'ä' => 'a',
+ 'Ä' => 'A',
+ 'ö' => 'o',
+ 'Ö' => 'O',
+
+ //russian cyrillic
+ 'а' => 'a',
+ 'А' => 'A',
+ 'б' => 'b',
+ 'Б' => 'B',
+ 'в' => 'v',
+ 'В' => 'V',
+ 'г' => 'g',
+ 'Г' => 'G',
+ 'д' => 'd',
+ 'Д' => 'D',
+ 'е' => 'e',
+ 'Е' => 'E',
+ 'ё' => 'jo',
+ 'Ё' => 'Jo',
+ 'ж' => 'zh',
+ 'Ж' => 'Zh',
+ 'з' => 'z',
+ 'З' => 'Z',
+ 'и' => 'i',
+ 'И' => 'I',
+ 'й' => 'j',
+ 'Й' => 'J',
+ 'к' => 'k',
+ 'К' => 'K',
+ 'л' => 'l',
+ 'Л' => 'L',
+ 'м' => 'm',
+ 'М' => 'M',
+ 'н' => 'n',
+ 'Н' => 'N',
+ 'о' => 'o',
+ 'О' => 'O',
+ 'п' => 'p',
+ 'П' => 'P',
+ 'р' => 'r',
+ 'Р' => 'R',
+ 'с' => 's',
+ 'С' => 'S',
+ 'т' => 't',
+ 'Т' => 'T',
+ 'у' => 'u',
+ 'У' => 'U',
+ 'ф' => 'f',
+ 'Ф' => 'F',
+ 'х' => 'x',
+ 'Х' => 'X',
+ 'ц' => 'c',
+ 'Ц' => 'C',
+ 'ч' => 'ch',
+ 'Ч' => 'Ch',
+ 'ш' => 'sh',
+ 'Ш' => 'Sh',
+ 'щ' => 'sch',
+ 'Щ' => 'Sch',
+ 'ъ' => '',
+ 'Ъ' => '',
+ 'ы' => 'y',
+ 'Ы' => 'Y',
+ 'ь' => '',
+ 'Ь' => '',
+ 'э' => 'eh',
+ 'Э' => 'Eh',
+ 'ю' => 'ju',
+ 'Ю' => 'Ju',
+ 'я' => 'ja',
+ 'Я' => 'Ja',
+
+ // Ukrainian cyrillic
+ 'Ґ' => 'Gh',
+ 'ґ' => 'gh',
+ 'Є' => 'Je',
+ 'є' => 'je',
+ 'І' => 'I',
+ 'і' => 'i',
+ 'Ї' => 'Ji',
+ 'ї' => 'ji',
+
+ // Georgian
+ 'ა' => 'a',
+ 'ბ' => 'b',
+ 'გ' => 'g',
+ 'დ' => 'd',
+ 'ე' => 'e',
+ 'ვ' => 'v',
+ 'ზ' => 'z',
+ 'თ' => 'th',
+ 'ი' => 'i',
+ 'კ' => 'p',
+ 'ლ' => 'l',
+ 'მ' => 'm',
+ 'ნ' => 'n',
+ 'ო' => 'o',
+ 'პ' => 'p',
+ 'ჟ' => 'zh',
+ 'რ' => 'r',
+ 'ს' => 's',
+ 'ტ' => 't',
+ 'უ' => 'u',
+ 'ფ' => 'ph',
+ 'ქ' => 'kh',
+ 'ღ' => 'gh',
+ 'ყ' => 'q',
+ 'შ' => 'sh',
+ 'ჩ' => 'ch',
+ 'ც' => 'c',
+ 'ძ' => 'dh',
+ 'წ' => 'w',
+ 'ჭ' => 'j',
+ 'ხ' => 'x',
+ 'ჯ' => 'jh',
+ 'ჰ' => 'xh',
+
+ //Sanskrit
+ 'अ' => 'a',
+ 'आ' => 'ah',
+ 'इ' => 'i',
+ 'ई' => 'ih',
+ 'उ' => 'u',
+ 'ऊ' => 'uh',
+ 'ऋ' => 'ry',
+ 'ॠ' => 'ryh',
+ 'ऌ' => 'ly',
+ 'ॡ' => 'lyh',
+ 'ए' => 'e',
+ 'ऐ' => 'ay',
+ 'ओ' => 'o',
+ 'औ' => 'aw',
+ 'अं' => 'amh',
+ 'अः' => 'aq',
+ 'क' => 'k',
+ 'ख' => 'kh',
+ 'ग' => 'g',
+ 'घ' => 'gh',
+ 'ङ' => 'nh',
+ 'च' => 'c',
+ 'छ' => 'ch',
+ 'ज' => 'j',
+ 'झ' => 'jh',
+ 'ञ' => 'ny',
+ 'ट' => 'tq',
+ 'ठ' => 'tqh',
+ 'ड' => 'dq',
+ 'ढ' => 'dqh',
+ 'ण' => 'nq',
+ 'त' => 't',
+ 'थ' => 'th',
+ 'द' => 'd',
+ 'ध' => 'dh',
+ 'न' => 'n',
+ 'प' => 'p',
+ 'फ' => 'ph',
+ 'ब' => 'b',
+ 'भ' => 'bh',
+ 'म' => 'm',
+ 'य' => 'z',
+ 'र' => 'r',
+ 'ल' => 'l',
+ 'व' => 'v',
+ 'श' => 'sh',
+ 'ष' => 'sqh',
+ 'स' => 's',
+ 'ह' => 'x',
+
+ //Sanskrit diacritics
+ 'Ā' => 'A',
+ 'Ī' => 'I',
+ 'Ū' => 'U',
+ 'Ṛ' => 'R',
+ 'Ṝ' => 'R',
+ 'Ṅ' => 'N',
+ 'Ñ' => 'N',
+ 'Ṭ' => 'T',
+ 'Ḍ' => 'D',
+ 'Ṇ' => 'N',
+ 'Ś' => 'S',
+ 'Ṣ' => 'S',
+ 'Ṁ' => 'M',
+ 'Ṃ' => 'M',
+ 'Ḥ' => 'H',
+ 'Ḷ' => 'L',
+ 'Ḹ' => 'L',
+ 'ā' => 'a',
+ 'ī' => 'i',
+ 'ū' => 'u',
+ 'ṛ' => 'r',
+ 'ṝ' => 'r',
+ 'ṅ' => 'n',
+ 'ñ' => 'n',
+ 'ṭ' => 't',
+ 'ḍ' => 'd',
+ 'ṇ' => 'n',
+ 'ś' => 's',
+ 'ṣ' => 's',
+ 'ṁ' => 'm',
+ 'ṃ' => 'm',
+ 'ḥ' => 'h',
+ 'ḷ' => 'l',
+ 'ḹ' => 'l',
+
+ //Hebrew
+ 'א' => 'a',
+ 'ב' => 'b',
+ 'ג' => 'g',
+ 'ד' => 'd',
+ 'ה' => 'h',
+ 'ו' => 'v',
+ 'ז' => 'z',
+ 'ח' => 'kh',
+ 'ט' => 'th',
+ 'י' => 'y',
+ 'ך' => 'h',
+ 'כ' => 'k',
+ 'ל' => 'l',
+ 'ם' => 'm',
+ 'מ' => 'm',
+ 'ן' => 'n',
+ 'נ' => 'n',
+ 'ס' => 's',
+ 'ע' => 'ah',
+ 'ף' => 'f',
+ 'פ' => 'p',
+ 'ץ' => 'c',
+ 'צ' => 'c',
+ 'ק' => 'q',
+ 'ר' => 'r',
+ 'ש' => 'sh',
+ 'ת' => 't',
+
+ //Arabic
+ 'ا' => 'a',
+ 'ب' => 'b',
+ 'ت' => 't',
+ 'ث' => 'th',
+ 'ج' => 'g',
+ 'ح' => 'xh',
+ 'خ' => 'x',
+ 'د' => 'd',
+ 'ذ' => 'dh',
+ 'ر' => 'r',
+ 'ز' => 'z',
+ 'س' => 's',
+ 'ش' => 'sh',
+ 'ص' => 's\'',
+ 'ض' => 'd\'',
+ 'ط' => 't\'',
+ 'ظ' => 'z\'',
+ 'ع' => 'y',
+ 'غ' => 'gh',
+ 'ف' => 'f',
+ 'ق' => 'q',
+ 'ك' => 'k',
+ 'ل' => 'l',
+ 'م' => 'm',
+ 'ن' => 'n',
+ 'ه' => 'x\'',
+ 'و' => 'u',
+ 'ي' => 'i',
+
+ // Japanese characters (last update: 2008-05-09)
+
+ // Japanese hiragana
+
+ // 3 character syllables, っ doubles the consonant after
+ 'っちゃ' => 'ccha',
+ 'っちぇ' => 'cche',
+ 'っちょ' => 'ccho',
+ 'っちゅ' => 'cchu',
+ 'っびゃ' => 'bbya',
+ 'っびぇ' => 'bbye',
+ 'っびぃ' => 'bbyi',
+ 'っびょ' => 'bbyo',
+ 'っびゅ' => 'bbyu',
+ 'っぴゃ' => 'ppya',
+ 'っぴぇ' => 'ppye',
+ 'っぴぃ' => 'ppyi',
+ 'っぴょ' => 'ppyo',
+ 'っぴゅ' => 'ppyu',
+ 'っちゃ' => 'ccha',
+ 'っちぇ' => 'cche',
+ 'っち' => 'cchi',
+ 'っちょ' => 'ccho',
+ 'っちゅ' => 'cchu',
+ // 'っひゃ'=>'hya',
+ // 'っひぇ'=>'hye',
+ // 'っひぃ'=>'hyi',
+ // 'っひょ'=>'hyo',
+ // 'っひゅ'=>'hyu',
+ 'っきゃ' => 'kkya',
+ 'っきぇ' => 'kkye',
+ 'っきぃ' => 'kkyi',
+ 'っきょ' => 'kkyo',
+ 'っきゅ' => 'kkyu',
+ 'っぎゃ' => 'ggya',
+ 'っぎぇ' => 'ggye',
+ 'っぎぃ' => 'ggyi',
+ 'っぎょ' => 'ggyo',
+ 'っぎゅ' => 'ggyu',
+ 'っみゃ' => 'mmya',
+ 'っみぇ' => 'mmye',
+ 'っみぃ' => 'mmyi',
+ 'っみょ' => 'mmyo',
+ 'っみゅ' => 'mmyu',
+ 'っにゃ' => 'nnya',
+ 'っにぇ' => 'nnye',
+ 'っにぃ' => 'nnyi',
+ 'っにょ' => 'nnyo',
+ 'っにゅ' => 'nnyu',
+ 'っりゃ' => 'rrya',
+ 'っりぇ' => 'rrye',
+ 'っりぃ' => 'rryi',
+ 'っりょ' => 'rryo',
+ 'っりゅ' => 'rryu',
+ 'っしゃ' => 'ssha',
+ 'っしぇ' => 'sshe',
+ 'っし' => 'sshi',
+ 'っしょ' => 'ssho',
+ 'っしゅ' => 'sshu',
+
+ // separate hiragana 'n' ('n' + 'i' != 'ni', normally we would write "kon'nichi wa" but the
+ // apostrophe would be converted to _ anyway)
+ 'んあ' => 'n_a',
+ 'んえ' => 'n_e',
+ 'んい' => 'n_i',
+ 'んお' => 'n_o',
+ 'んう' => 'n_u',
+ 'んや' => 'n_ya',
+ 'んよ' => 'n_yo',
+ 'んゆ' => 'n_yu',
+
+ // 2 character syllables - normal
+ 'ふぁ' => 'fa',
+ 'ふぇ' => 'fe',
+ 'ふぃ' => 'fi',
+ 'ふぉ' => 'fo',
+ 'ちゃ' => 'cha',
+ 'ちぇ' => 'che',
+ 'ち' => 'chi',
+ 'ちょ' => 'cho',
+ 'ちゅ' => 'chu',
+ 'ひゃ' => 'hya',
+ 'ひぇ' => 'hye',
+ 'ひぃ' => 'hyi',
+ 'ひょ' => 'hyo',
+ 'ひゅ' => 'hyu',
+ 'びゃ' => 'bya',
+ 'びぇ' => 'bye',
+ 'びぃ' => 'byi',
+ 'びょ' => 'byo',
+ 'びゅ' => 'byu',
+ 'ぴゃ' => 'pya',
+ 'ぴぇ' => 'pye',
+ 'ぴぃ' => 'pyi',
+ 'ぴょ' => 'pyo',
+ 'ぴゅ' => 'pyu',
+ 'きゃ' => 'kya',
+ 'きぇ' => 'kye',
+ 'きぃ' => 'kyi',
+ 'きょ' => 'kyo',
+ 'きゅ' => 'kyu',
+ 'ぎゃ' => 'gya',
+ 'ぎぇ' => 'gye',
+ 'ぎぃ' => 'gyi',
+ 'ぎょ' => 'gyo',
+ 'ぎゅ' => 'gyu',
+ 'みゃ' => 'mya',
+ 'みぇ' => 'mye',
+ 'みぃ' => 'myi',
+ 'みょ' => 'myo',
+ 'みゅ' => 'myu',
+ 'にゃ' => 'nya',
+ 'にぇ' => 'nye',
+ 'にぃ' => 'nyi',
+ 'にょ' => 'nyo',
+ 'にゅ' => 'nyu',
+ 'りゃ' => 'rya',
+ 'りぇ' => 'rye',
+ 'りぃ' => 'ryi',
+ 'りょ' => 'ryo',
+ 'りゅ' => 'ryu',
+ 'しゃ' => 'sha',
+ 'しぇ' => 'she',
+ 'し' => 'shi',
+ 'しょ' => 'sho',
+ 'しゅ' => 'shu',
+ 'じゃ' => 'ja',
+ 'じぇ' => 'je',
+ 'じょ' => 'jo',
+ 'じゅ' => 'ju',
+ 'うぇ' => 'we',
+ 'うぃ' => 'wi',
+ 'いぇ' => 'ye',
+
+ // 2 character syllables, っ doubles the consonant after
+ 'っば' => 'bba',
+ 'っべ' => 'bbe',
+ 'っび' => 'bbi',
+ 'っぼ' => 'bbo',
+ 'っぶ' => 'bbu',
+ 'っぱ' => 'ppa',
+ 'っぺ' => 'ppe',
+ 'っぴ' => 'ppi',
+ 'っぽ' => 'ppo',
+ 'っぷ' => 'ppu',
+ 'った' => 'tta',
+ 'って' => 'tte',
+ 'っち' => 'cchi',
+ 'っと' => 'tto',
+ 'っつ' => 'ttsu',
+ 'っだ' => 'dda',
+ 'っで' => 'dde',
+ 'っぢ' => 'ddi',
+ 'っど' => 'ddo',
+ 'っづ' => 'ddu',
+ 'っが' => 'gga',
+ 'っげ' => 'gge',
+ 'っぎ' => 'ggi',
+ 'っご' => 'ggo',
+ 'っぐ' => 'ggu',
+ 'っか' => 'kka',
+ 'っけ' => 'kke',
+ 'っき' => 'kki',
+ 'っこ' => 'kko',
+ 'っく' => 'kku',
+ 'っま' => 'mma',
+ 'っめ' => 'mme',
+ 'っみ' => 'mmi',
+ 'っも' => 'mmo',
+ 'っむ' => 'mmu',
+ 'っな' => 'nna',
+ 'っね' => 'nne',
+ 'っに' => 'nni',
+ 'っの' => 'nno',
+ 'っぬ' => 'nnu',
+ 'っら' => 'rra',
+ 'っれ' => 'rre',
+ 'っり' => 'rri',
+ 'っろ' => 'rro',
+ 'っる' => 'rru',
+ 'っさ' => 'ssa',
+ 'っせ' => 'sse',
+ 'っし' => 'sshi',
+ 'っそ' => 'sso',
+ 'っす' => 'ssu',
+ 'っざ' => 'zza',
+ 'っぜ' => 'zze',
+ 'っじ' => 'jji',
+ 'っぞ' => 'zzo',
+ 'っず' => 'zzu',
+
+ // 1 character syllables
+ 'あ' => 'a',
+ 'え' => 'e',
+ 'い' => 'i',
+ 'お' => 'o',
+ 'う' => 'u',
+ 'ん' => 'n',
+ 'は' => 'ha',
+ 'へ' => 'he',
+ 'ひ' => 'hi',
+ 'ほ' => 'ho',
+ 'ふ' => 'fu',
+ 'ば' => 'ba',
+ 'べ' => 'be',
+ 'び' => 'bi',
+ 'ぼ' => 'bo',
+ 'ぶ' => 'bu',
+ 'ぱ' => 'pa',
+ 'ぺ' => 'pe',
+ 'ぴ' => 'pi',
+ 'ぽ' => 'po',
+ 'ぷ' => 'pu',
+ 'た' => 'ta',
+ 'て' => 'te',
+ 'ち' => 'chi',
+ 'と' => 'to',
+ 'つ' => 'tsu',
+ 'だ' => 'da',
+ 'で' => 'de',
+ 'ぢ' => 'di',
+ 'ど' => 'do',
+ 'づ' => 'du',
+ 'が' => 'ga',
+ 'げ' => 'ge',
+ 'ぎ' => 'gi',
+ 'ご' => 'go',
+ 'ぐ' => 'gu',
+ 'か' => 'ka',
+ 'け' => 'ke',
+ 'き' => 'ki',
+ 'こ' => 'ko',
+ 'く' => 'ku',
+ 'ま' => 'ma',
+ 'め' => 'me',
+ 'み' => 'mi',
+ 'も' => 'mo',
+ 'む' => 'mu',
+ 'な' => 'na',
+ 'ね' => 'ne',
+ 'に' => 'ni',
+ 'の' => 'no',
+ 'ぬ' => 'nu',
+ 'ら' => 'ra',
+ 'れ' => 're',
+ 'り' => 'ri',
+ 'ろ' => 'ro',
+ 'る' => 'ru',
+ 'さ' => 'sa',
+ 'せ' => 'se',
+ 'し' => 'shi',
+ 'そ' => 'so',
+ 'す' => 'su',
+ 'わ' => 'wa',
+ 'を' => 'wo',
+ 'ざ' => 'za',
+ 'ぜ' => 'ze',
+ 'じ' => 'ji',
+ 'ぞ' => 'zo',
+ 'ず' => 'zu',
+ 'や' => 'ya',
+ 'よ' => 'yo',
+ 'ゆ' => 'yu',
+ // old characters
+ 'ゑ' => 'we',
+ 'ゐ' => 'wi',
+
+ // convert what's left (probably only kicks in when something's missing above)
+ // 'ぁ'=>'a','ぇ'=>'e','ぃ'=>'i','ぉ'=>'o','ぅ'=>'u',
+ // 'ゃ'=>'ya','ょ'=>'yo','ゅ'=>'yu',
+
+ // never seen one of those (disabled for the moment)
+ // 'ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo','ヴ'=>'vu',
+ // 'でゃ'=>'dha','でぇ'=>'dhe','でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu',
+ // 'どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi','どぉ'=>'dwo','どぅ'=>'dwu',
+ // 'ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo','ぢゅ'=>'dyu',
+ // 'ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo','ふぅ'=>'fwu',
+ // 'ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
+ // 'すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi','すぉ'=>'swo','すぅ'=>'swu',
+ // 'てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
+ // 'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu',
+ // 'とぁ'=>'twa','とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu',
+ // 'ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi','ヴょ'=>'vyo','ヴゅ'=>'vyu',
+ // 'うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who','うぅ'=>'whu',
+ // 'じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi','じょ'=>'zho','じゅ'=>'zhu',
+ // 'じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo','じゅ'=>'zyu',
+
+ // 'spare' characters from other romanization systems
+ // 'だ'=>'da','で'=>'de','ぢ'=>'di','ど'=>'do','づ'=>'du',
+ // 'ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
+ // 'さ'=>'sa','せ'=>'se','し'=>'si','そ'=>'so','す'=>'su',
+ // 'ちゃ'=>'cya','ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu',
+ //'じゃ'=>'jya','じぇ'=>'jye','じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu',
+ //'りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo','りゅ'=>'lyu',
+ //'しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo','しゅ'=>'syu',
+ //'ちゃ'=>'tya','ちぇ'=>'tye','ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu',
+ //'し'=>'ci',,い'=>'yi','ぢ'=>'dzi',
+ //'っじゃ'=>'jja','っじぇ'=>'jje','っじ'=>'jji','っじょ'=>'jjo','っじゅ'=>'jju',
+
+
+ // Japanese katakana
+
+ // 4 character syllables: ッ doubles the consonant after, ー doubles the vowel before
+ // (usually written with macron, but we don't want that in our URLs)
+ 'ッビャー' => 'bbyaa',
+ 'ッビェー' => 'bbyee',
+ 'ッビィー' => 'bbyii',
+ 'ッビョー' => 'bbyoo',
+ 'ッビュー' => 'bbyuu',
+ 'ッピャー' => 'ppyaa',
+ 'ッピェー' => 'ppyee',
+ 'ッピィー' => 'ppyii',
+ 'ッピョー' => 'ppyoo',
+ 'ッピュー' => 'ppyuu',
+ 'ッキャー' => 'kkyaa',
+ 'ッキェー' => 'kkyee',
+ 'ッキィー' => 'kkyii',
+ 'ッキョー' => 'kkyoo',
+ 'ッキュー' => 'kkyuu',
+ 'ッギャー' => 'ggyaa',
+ 'ッギェー' => 'ggyee',
+ 'ッギィー' => 'ggyii',
+ 'ッギョー' => 'ggyoo',
+ 'ッギュー' => 'ggyuu',
+ 'ッミャー' => 'mmyaa',
+ 'ッミェー' => 'mmyee',
+ 'ッミィー' => 'mmyii',
+ 'ッミョー' => 'mmyoo',
+ 'ッミュー' => 'mmyuu',
+ 'ッニャー' => 'nnyaa',
+ 'ッニェー' => 'nnyee',
+ 'ッニィー' => 'nnyii',
+ 'ッニョー' => 'nnyoo',
+ 'ッニュー' => 'nnyuu',
+ 'ッリャー' => 'rryaa',
+ 'ッリェー' => 'rryee',
+ 'ッリィー' => 'rryii',
+ 'ッリョー' => 'rryoo',
+ 'ッリュー' => 'rryuu',
+ 'ッシャー' => 'sshaa',
+ 'ッシェー' => 'sshee',
+ 'ッシー' => 'sshii',
+ 'ッショー' => 'sshoo',
+ 'ッシュー' => 'sshuu',
+ 'ッチャー' => 'cchaa',
+ 'ッチェー' => 'cchee',
+ 'ッチー' => 'cchii',
+ 'ッチョー' => 'cchoo',
+ 'ッチュー' => 'cchuu',
+ 'ッティー' => 'ttii',
+ 'ッヂィー' => 'ddii',
+
+ // 3 character syllables - doubled vowels
+ 'ファー' => 'faa',
+ 'フェー' => 'fee',
+ 'フィー' => 'fii',
+ 'フォー' => 'foo',
+ 'フャー' => 'fyaa',
+ 'フェー' => 'fyee',
+ 'フィー' => 'fyii',
+ 'フョー' => 'fyoo',
+ 'フュー' => 'fyuu',
+ 'ヒャー' => 'hyaa',
+ 'ヒェー' => 'hyee',
+ 'ヒィー' => 'hyii',
+ 'ヒョー' => 'hyoo',
+ 'ヒュー' => 'hyuu',
+ 'ビャー' => 'byaa',
+ 'ビェー' => 'byee',
+ 'ビィー' => 'byii',
+ 'ビョー' => 'byoo',
+ 'ビュー' => 'byuu',
+ 'ピャー' => 'pyaa',
+ 'ピェー' => 'pyee',
+ 'ピィー' => 'pyii',
+ 'ピョー' => 'pyoo',
+ 'ピュー' => 'pyuu',
+ 'キャー' => 'kyaa',
+ 'キェー' => 'kyee',
+ 'キィー' => 'kyii',
+ 'キョー' => 'kyoo',
+ 'キュー' => 'kyuu',
+ 'ギャー' => 'gyaa',
+ 'ギェー' => 'gyee',
+ 'ギィー' => 'gyii',
+ 'ギョー' => 'gyoo',
+ 'ギュー' => 'gyuu',
+ 'ミャー' => 'myaa',
+ 'ミェー' => 'myee',
+ 'ミィー' => 'myii',
+ 'ミョー' => 'myoo',
+ 'ミュー' => 'myuu',
+ 'ニャー' => 'nyaa',
+ 'ニェー' => 'nyee',
+ 'ニィー' => 'nyii',
+ 'ニョー' => 'nyoo',
+ 'ニュー' => 'nyuu',
+ 'リャー' => 'ryaa',
+ 'リェー' => 'ryee',
+ 'リィー' => 'ryii',
+ 'リョー' => 'ryoo',
+ 'リュー' => 'ryuu',
+ 'シャー' => 'shaa',
+ 'シェー' => 'shee',
+ 'シー' => 'shii',
+ 'ショー' => 'shoo',
+ 'シュー' => 'shuu',
+ 'ジャー' => 'jaa',
+ 'ジェー' => 'jee',
+ 'ジー' => 'jii',
+ 'ジョー' => 'joo',
+ 'ジュー' => 'juu',
+ 'スァー' => 'swaa',
+ 'スェー' => 'swee',
+ 'スィー' => 'swii',
+ 'スォー' => 'swoo',
+ 'スゥー' => 'swuu',
+ 'デァー' => 'daa',
+ 'デェー' => 'dee',
+ 'ディー' => 'dii',
+ 'デォー' => 'doo',
+ 'デゥー' => 'duu',
+ 'チャー' => 'chaa',
+ 'チェー' => 'chee',
+ 'チー' => 'chii',
+ 'チョー' => 'choo',
+ 'チュー' => 'chuu',
+ 'ヂャー' => 'dyaa',
+ 'ヂェー' => 'dyee',
+ 'ヂィー' => 'dyii',
+ 'ヂョー' => 'dyoo',
+ 'ヂュー' => 'dyuu',
+ 'ツャー' => 'tsaa',
+ 'ツェー' => 'tsee',
+ 'ツィー' => 'tsii',
+ 'ツョー' => 'tsoo',
+ 'ツー' => 'tsuu',
+ 'トァー' => 'twaa',
+ 'トェー' => 'twee',
+ 'トィー' => 'twii',
+ 'トォー' => 'twoo',
+ 'トゥー' => 'twuu',
+ 'ドァー' => 'dwaa',
+ 'ドェー' => 'dwee',
+ 'ドィー' => 'dwii',
+ 'ドォー' => 'dwoo',
+ 'ドゥー' => 'dwuu',
+ 'ウァー' => 'whaa',
+ 'ウェー' => 'whee',
+ 'ウィー' => 'whii',
+ 'ウォー' => 'whoo',
+ 'ウゥー' => 'whuu',
+ 'ヴャー' => 'vyaa',
+ 'ヴェー' => 'vyee',
+ 'ヴィー' => 'vyii',
+ 'ヴョー' => 'vyoo',
+ 'ヴュー' => 'vyuu',
+ 'ヴァー' => 'vaa',
+ 'ヴェー' => 'vee',
+ 'ヴィー' => 'vii',
+ 'ヴォー' => 'voo',
+ 'ヴー' => 'vuu',
+ 'ウェー' => 'wee',
+ 'ウィー' => 'wii',
+ 'イェー' => 'yee',
+ 'ティー' => 'tii',
+ 'ヂィー' => 'dii',
+
+ // 3 character syllables - doubled consonants
+ 'ッビャ' => 'bbya',
+ 'ッビェ' => 'bbye',
+ 'ッビィ' => 'bbyi',
+ 'ッビョ' => 'bbyo',
+ 'ッビュ' => 'bbyu',
+ 'ッピャ' => 'ppya',
+ 'ッピェ' => 'ppye',
+ 'ッピィ' => 'ppyi',
+ 'ッピョ' => 'ppyo',
+ 'ッピュ' => 'ppyu',
+ 'ッキャ' => 'kkya',
+ 'ッキェ' => 'kkye',
+ 'ッキィ' => 'kkyi',
+ 'ッキョ' => 'kkyo',
+ 'ッキュ' => 'kkyu',
+ 'ッギャ' => 'ggya',
+ 'ッギェ' => 'ggye',
+ 'ッギィ' => 'ggyi',
+ 'ッギョ' => 'ggyo',
+ 'ッギュ' => 'ggyu',
+ 'ッミャ' => 'mmya',
+ 'ッミェ' => 'mmye',
+ 'ッミィ' => 'mmyi',
+ 'ッミョ' => 'mmyo',
+ 'ッミュ' => 'mmyu',
+ 'ッニャ' => 'nnya',
+ 'ッニェ' => 'nnye',
+ 'ッニィ' => 'nnyi',
+ 'ッニョ' => 'nnyo',
+ 'ッニュ' => 'nnyu',
+ 'ッリャ' => 'rrya',
+ 'ッリェ' => 'rrye',
+ 'ッリィ' => 'rryi',
+ 'ッリョ' => 'rryo',
+ 'ッリュ' => 'rryu',
+ 'ッシャ' => 'ssha',
+ 'ッシェ' => 'sshe',
+ 'ッシ' => 'sshi',
+ 'ッショ' => 'ssho',
+ 'ッシュ' => 'sshu',
+ 'ッチャ' => 'ccha',
+ 'ッチェ' => 'cche',
+ 'ッチ' => 'cchi',
+ 'ッチョ' => 'ccho',
+ 'ッチュ' => 'cchu',
+ 'ッティ' => 'tti',
+ 'ッヂィ' => 'ddi',
+
+ // 3 character syllables - doubled vowel and consonants
+ 'ッバー' => 'bbaa',
+ 'ッベー' => 'bbee',
+ 'ッビー' => 'bbii',
+ 'ッボー' => 'bboo',
+ 'ッブー' => 'bbuu',
+ 'ッパー' => 'ppaa',
+ 'ッペー' => 'ppee',
+ 'ッピー' => 'ppii',
+ 'ッポー' => 'ppoo',
+ 'ップー' => 'ppuu',
+ 'ッケー' => 'kkee',
+ 'ッキー' => 'kkii',
+ 'ッコー' => 'kkoo',
+ 'ックー' => 'kkuu',
+ 'ッカー' => 'kkaa',
+ 'ッガー' => 'ggaa',
+ 'ッゲー' => 'ggee',
+ 'ッギー' => 'ggii',
+ 'ッゴー' => 'ggoo',
+ 'ッグー' => 'gguu',
+ 'ッマー' => 'maa',
+ 'ッメー' => 'mee',
+ 'ッミー' => 'mii',
+ 'ッモー' => 'moo',
+ 'ッムー' => 'muu',
+ 'ッナー' => 'nnaa',
+ 'ッネー' => 'nnee',
+ 'ッニー' => 'nnii',
+ 'ッノー' => 'nnoo',
+ 'ッヌー' => 'nnuu',
+ 'ッラー' => 'rraa',
+ 'ッレー' => 'rree',
+ 'ッリー' => 'rrii',
+ 'ッロー' => 'rroo',
+ 'ッルー' => 'rruu',
+ 'ッサー' => 'ssaa',
+ 'ッセー' => 'ssee',
+ 'ッシー' => 'sshii',
+ 'ッソー' => 'ssoo',
+ 'ッスー' => 'ssuu',
+ 'ッザー' => 'zzaa',
+ 'ッゼー' => 'zzee',
+ 'ッジー' => 'jjii',
+ 'ッゾー' => 'zzoo',
+ 'ッズー' => 'zzuu',
+ 'ッター' => 'ttaa',
+ 'ッテー' => 'ttee',
+ 'ッチー' => 'chii',
+ 'ットー' => 'ttoo',
+ 'ッツー' => 'ttsuu',
+ 'ッダー' => 'ddaa',
+ 'ッデー' => 'ddee',
+ 'ッヂー' => 'ddii',
+ 'ッドー' => 'ddoo',
+ 'ッヅー' => 'dduu',
+
+ // 2 character syllables - normal
+ 'ファ' => 'fa',
+ 'フェ' => 'fe',
+ 'フィ' => 'fi',
+ 'フォ' => 'fo',
+ 'フゥ' => 'fu',
+ // 'フャ'=>'fya',
+ // 'フェ'=>'fye',
+ // 'フィ'=>'fyi',
+ // 'フョ'=>'fyo',
+ // 'フュ'=>'fyu',
+ 'フャ' => 'fa',
+ 'フェ' => 'fe',
+ 'フィ' => 'fi',
+ 'フョ' => 'fo',
+ 'フュ' => 'fu',
+ 'ヒャ' => 'hya',
+ 'ヒェ' => 'hye',
+ 'ヒィ' => 'hyi',
+ 'ヒョ' => 'hyo',
+ 'ヒュ' => 'hyu',
+ 'ビャ' => 'bya',
+ 'ビェ' => 'bye',
+ 'ビィ' => 'byi',
+ 'ビョ' => 'byo',
+ 'ビュ' => 'byu',
+ 'ピャ' => 'pya',
+ 'ピェ' => 'pye',
+ 'ピィ' => 'pyi',
+ 'ピョ' => 'pyo',
+ 'ピュ' => 'pyu',
+ 'キャ' => 'kya',
+ 'キェ' => 'kye',
+ 'キィ' => 'kyi',
+ 'キョ' => 'kyo',
+ 'キュ' => 'kyu',
+ 'ギャ' => 'gya',
+ 'ギェ' => 'gye',
+ 'ギィ' => 'gyi',
+ 'ギョ' => 'gyo',
+ 'ギュ' => 'gyu',
+ 'ミャ' => 'mya',
+ 'ミェ' => 'mye',
+ 'ミィ' => 'myi',
+ 'ミョ' => 'myo',
+ 'ミュ' => 'myu',
+ 'ニャ' => 'nya',
+ 'ニェ' => 'nye',
+ 'ニィ' => 'nyi',
+ 'ニョ' => 'nyo',
+ 'ニュ' => 'nyu',
+ 'リャ' => 'rya',
+ 'リェ' => 'rye',
+ 'リィ' => 'ryi',
+ 'リョ' => 'ryo',
+ 'リュ' => 'ryu',
+ 'シャ' => 'sha',
+ 'シェ' => 'she',
+ 'ショ' => 'sho',
+ 'シュ' => 'shu',
+ 'ジャ' => 'ja',
+ 'ジェ' => 'je',
+ 'ジョ' => 'jo',
+ 'ジュ' => 'ju',
+ 'スァ' => 'swa',
+ 'スェ' => 'swe',
+ 'スィ' => 'swi',
+ 'スォ' => 'swo',
+ 'スゥ' => 'swu',
+ 'デァ' => 'da',
+ 'デェ' => 'de',
+ 'ディ' => 'di',
+ 'デォ' => 'do',
+ 'デゥ' => 'du',
+ 'チャ' => 'cha',
+ 'チェ' => 'che',
+ 'チ' => 'chi',
+ 'チョ' => 'cho',
+ 'チュ' => 'chu',
+ // 'ヂャ'=>'dya',
+ // 'ヂェ'=>'dye',
+ // 'ヂィ'=>'dyi',
+ // 'ヂョ'=>'dyo',
+ // 'ヂュ'=>'dyu',
+ 'ツャ' => 'tsa',
+ 'ツェ' => 'tse',
+ 'ツィ' => 'tsi',
+ 'ツョ' => 'tso',
+ 'ツ' => 'tsu',
+ 'トァ' => 'twa',
+ 'トェ' => 'twe',
+ 'トィ' => 'twi',
+ 'トォ' => 'two',
+ 'トゥ' => 'twu',
+ 'ドァ' => 'dwa',
+ 'ドェ' => 'dwe',
+ 'ドィ' => 'dwi',
+ 'ドォ' => 'dwo',
+ 'ドゥ' => 'dwu',
+ 'ウァ' => 'wha',
+ 'ウェ' => 'whe',
+ 'ウィ' => 'whi',
+ 'ウォ' => 'who',
+ 'ウゥ' => 'whu',
+ 'ヴャ' => 'vya',
+ 'ヴェ' => 'vye',
+ 'ヴィ' => 'vyi',
+ 'ヴョ' => 'vyo',
+ 'ヴュ' => 'vyu',
+ 'ヴァ' => 'va',
+ 'ヴェ' => 've',
+ 'ヴィ' => 'vi',
+ 'ヴォ' => 'vo',
+ 'ヴ' => 'vu',
+ 'ウェ' => 'we',
+ 'ウィ' => 'wi',
+ 'イェ' => 'ye',
+ 'ティ' => 'ti',
+ 'ヂィ' => 'di',
+
+ // 2 character syllables - doubled vowel
+ 'アー' => 'aa',
+ 'エー' => 'ee',
+ 'イー' => 'ii',
+ 'オー' => 'oo',
+ 'ウー' => 'uu',
+ 'ダー' => 'daa',
+ 'デー' => 'dee',
+ 'ヂー' => 'dii',
+ 'ドー' => 'doo',
+ 'ヅー' => 'duu',
+ 'ハー' => 'haa',
+ 'ヘー' => 'hee',
+ 'ヒー' => 'hii',
+ 'ホー' => 'hoo',
+ 'フー' => 'fuu',
+ 'バー' => 'baa',
+ 'ベー' => 'bee',
+ 'ビー' => 'bii',
+ 'ボー' => 'boo',
+ 'ブー' => 'buu',
+ 'パー' => 'paa',
+ 'ペー' => 'pee',
+ 'ピー' => 'pii',
+ 'ポー' => 'poo',
+ 'プー' => 'puu',
+ 'ケー' => 'kee',
+ 'キー' => 'kii',
+ 'コー' => 'koo',
+ 'クー' => 'kuu',
+ 'カー' => 'kaa',
+ 'ガー' => 'gaa',
+ 'ゲー' => 'gee',
+ 'ギー' => 'gii',
+ 'ゴー' => 'goo',
+ 'グー' => 'guu',
+ 'マー' => 'maa',
+ 'メー' => 'mee',
+ 'ミー' => 'mii',
+ 'モー' => 'moo',
+ 'ムー' => 'muu',
+ 'ナー' => 'naa',
+ 'ネー' => 'nee',
+ 'ニー' => 'nii',
+ 'ノー' => 'noo',
+ 'ヌー' => 'nuu',
+ 'ラー' => 'raa',
+ 'レー' => 'ree',
+ 'リー' => 'rii',
+ 'ロー' => 'roo',
+ 'ルー' => 'ruu',
+ 'サー' => 'saa',
+ 'セー' => 'see',
+ 'シー' => 'shii',
+ 'ソー' => 'soo',
+ 'スー' => 'suu',
+ 'ザー' => 'zaa',
+ 'ゼー' => 'zee',
+ 'ジー' => 'jii',
+ 'ゾー' => 'zoo',
+ 'ズー' => 'zuu',
+ 'ター' => 'taa',
+ 'テー' => 'tee',
+ 'チー' => 'chii',
+ 'トー' => 'too',
+ 'ツー' => 'tsuu',
+ 'ワー' => 'waa',
+ 'ヲー' => 'woo',
+ 'ヤー' => 'yaa',
+ 'ヨー' => 'yoo',
+ 'ユー' => 'yuu',
+ 'ヵー' => 'kaa',
+ 'ヶー' => 'kee',
+ // old characters
+ 'ヱー' => 'wee',
+ 'ヰー' => 'wii',
+
+ // separate katakana 'n'
+ 'ンア' => 'n_a',
+ 'ンエ' => 'n_e',
+ 'ンイ' => 'n_i',
+ 'ンオ' => 'n_o',
+ 'ンウ' => 'n_u',
+ 'ンヤ' => 'n_ya',
+ 'ンヨ' => 'n_yo',
+ 'ンユ' => 'n_yu',
+
+ // 2 character syllables - doubled consonants
+ 'ッバ' => 'bba',
+ 'ッベ' => 'bbe',
+ 'ッビ' => 'bbi',
+ 'ッボ' => 'bbo',
+ 'ッブ' => 'bbu',
+ 'ッパ' => 'ppa',
+ 'ッペ' => 'ppe',
+ 'ッピ' => 'ppi',
+ 'ッポ' => 'ppo',
+ 'ップ' => 'ppu',
+ 'ッケ' => 'kke',
+ 'ッキ' => 'kki',
+ 'ッコ' => 'kko',
+ 'ック' => 'kku',
+ 'ッカ' => 'kka',
+ 'ッガ' => 'gga',
+ 'ッゲ' => 'gge',
+ 'ッギ' => 'ggi',
+ 'ッゴ' => 'ggo',
+ 'ッグ' => 'ggu',
+ 'ッマ' => 'ma',
+ 'ッメ' => 'me',
+ 'ッミ' => 'mi',
+ 'ッモ' => 'mo',
+ 'ッム' => 'mu',
+ 'ッナ' => 'nna',
+ 'ッネ' => 'nne',
+ 'ッニ' => 'nni',
+ 'ッノ' => 'nno',
+ 'ッヌ' => 'nnu',
+ 'ッラ' => 'rra',
+ 'ッレ' => 'rre',
+ 'ッリ' => 'rri',
+ 'ッロ' => 'rro',
+ 'ッル' => 'rru',
+ 'ッサ' => 'ssa',
+ 'ッセ' => 'sse',
+ 'ッシ' => 'sshi',
+ 'ッソ' => 'sso',
+ 'ッス' => 'ssu',
+ 'ッザ' => 'zza',
+ 'ッゼ' => 'zze',
+ 'ッジ' => 'jji',
+ 'ッゾ' => 'zzo',
+ 'ッズ' => 'zzu',
+ 'ッタ' => 'tta',
+ 'ッテ' => 'tte',
+ 'ッチ' => 'cchi',
+ 'ット' => 'tto',
+ 'ッツ' => 'ttsu',
+ 'ッダ' => 'dda',
+ 'ッデ' => 'dde',
+ 'ッヂ' => 'ddi',
+ 'ッド' => 'ddo',
+ 'ッヅ' => 'ddu',
+
+ // 1 character syllables
+ 'ア' => 'a',
+ 'エ' => 'e',
+ 'イ' => 'i',
+ 'オ' => 'o',
+ 'ウ' => 'u',
+ 'ン' => 'n',
+ 'ハ' => 'ha',
+ 'ヘ' => 'he',
+ 'ヒ' => 'hi',
+ 'ホ' => 'ho',
+ 'フ' => 'fu',
+ 'バ' => 'ba',
+ 'ベ' => 'be',
+ 'ビ' => 'bi',
+ 'ボ' => 'bo',
+ 'ブ' => 'bu',
+ 'パ' => 'pa',
+ 'ペ' => 'pe',
+ 'ピ' => 'pi',
+ 'ポ' => 'po',
+ 'プ' => 'pu',
+ 'ケ' => 'ke',
+ 'キ' => 'ki',
+ 'コ' => 'ko',
+ 'ク' => 'ku',
+ 'カ' => 'ka',
+ 'ガ' => 'ga',
+ 'ゲ' => 'ge',
+ 'ギ' => 'gi',
+ 'ゴ' => 'go',
+ 'グ' => 'gu',
+ 'マ' => 'ma',
+ 'メ' => 'me',
+ 'ミ' => 'mi',
+ 'モ' => 'mo',
+ 'ム' => 'mu',
+ 'ナ' => 'na',
+ 'ネ' => 'ne',
+ 'ニ' => 'ni',
+ 'ノ' => 'no',
+ 'ヌ' => 'nu',
+ 'ラ' => 'ra',
+ 'レ' => 're',
+ 'リ' => 'ri',
+ 'ロ' => 'ro',
+ 'ル' => 'ru',
+ 'サ' => 'sa',
+ 'セ' => 'se',
+ 'シ' => 'shi',
+ 'ソ' => 'so',
+ 'ス' => 'su',
+ 'ザ' => 'za',
+ 'ゼ' => 'ze',
+ 'ジ' => 'ji',
+ 'ゾ' => 'zo',
+ 'ズ' => 'zu',
+ 'タ' => 'ta',
+ 'テ' => 'te',
+ 'チ' => 'chi',
+ 'ト' => 'to',
+ 'ツ' => 'tsu',
+ 'ダ' => 'da',
+ 'デ' => 'de',
+ 'ヂ' => 'di',
+ 'ド' => 'do',
+ 'ヅ' => 'du',
+ 'ワ' => 'wa',
+ 'ヲ' => 'wo',
+ 'ヤ' => 'ya',
+ 'ヨ' => 'yo',
+ 'ユ' => 'yu',
+ 'ヵ' => 'ka',
+ 'ヶ' => 'ke',
+ // old characters
+ 'ヱ' => 'we',
+ 'ヰ' => 'wi',
+
+ // convert what's left (probably only kicks in when something's missing above)
+ 'ァ' => 'a',
+ 'ェ' => 'e',
+ 'ィ' => 'i',
+ 'ォ' => 'o',
+ 'ゥ' => 'u',
+ 'ャ' => 'ya',
+ 'ョ' => 'yo',
+ 'ュ' => 'yu',
+
+ // special characters
+ '・' => '_',
+ '、' => '_',
+ 'ー' => '_',
+ // when used with hiragana (seldom), this character would not be converted otherwise
+
+ // 'ラ'=>'la',
+ // 'レ'=>'le',
+ // 'リ'=>'li',
+ // 'ロ'=>'lo',
+ // 'ル'=>'lu',
+ // 'チャ'=>'cya',
+ // 'チェ'=>'cye',
+ // 'チィ'=>'cyi',
+ // 'チョ'=>'cyo',
+ // 'チュ'=>'cyu',
+ // 'デャ'=>'dha',
+ // 'デェ'=>'dhe',
+ // 'ディ'=>'dhi',
+ // 'デョ'=>'dho',
+ // 'デュ'=>'dhu',
+ // 'リャ'=>'lya',
+ // 'リェ'=>'lye',
+ // 'リィ'=>'lyi',
+ // 'リョ'=>'lyo',
+ // 'リュ'=>'lyu',
+ // 'テャ'=>'tha',
+ // 'テェ'=>'the',
+ // 'ティ'=>'thi',
+ // 'テョ'=>'tho',
+ // 'テュ'=>'thu',
+ // 'ファ'=>'fwa',
+ // 'フェ'=>'fwe',
+ // 'フィ'=>'fwi',
+ // 'フォ'=>'fwo',
+ // 'フゥ'=>'fwu',
+ // 'チャ'=>'tya',
+ // 'チェ'=>'tye',
+ // 'チィ'=>'tyi',
+ // 'チョ'=>'tyo',
+ // 'チュ'=>'tyu',
+ // 'ジャ'=>'jya',
+ // 'ジェ'=>'jye',
+ // 'ジィ'=>'jyi',
+ // 'ジョ'=>'jyo',
+ // 'ジュ'=>'jyu',
+ // 'ジャ'=>'zha',
+ // 'ジェ'=>'zhe',
+ // 'ジィ'=>'zhi',
+ // 'ジョ'=>'zho',
+ // 'ジュ'=>'zhu',
+ // 'ジャ'=>'zya',
+ // 'ジェ'=>'zye',
+ // 'ジィ'=>'zyi',
+ // 'ジョ'=>'zyo',
+ // 'ジュ'=>'zyu',
+ // 'シャ'=>'sya',
+ // 'シェ'=>'sye',
+ // 'シィ'=>'syi',
+ // 'ショ'=>'syo',
+ // 'シュ'=>'syu',
+ // 'シ'=>'ci',
+ // 'フ'=>'hu',
+ // 'シ'=>'si',
+ // 'チ'=>'ti',
+ // 'ツ'=>'tu',
+ // 'イ'=>'yi',
+ // 'ヂ'=>'dzi',
+
+ // "Greeklish"
+ 'Γ' => 'G',
+ 'Δ' => 'E',
+ 'Θ' => 'Th',
+ 'Λ' => 'L',
+ 'Ξ' => 'X',
+ 'Π' => 'P',
+ 'Σ' => 'S',
+ 'Φ' => 'F',
+ 'Ψ' => 'Ps',
+ 'γ' => 'g',
+ 'δ' => 'e',
+ 'θ' => 'th',
+ 'λ' => 'l',
+ 'ξ' => 'x',
+ 'π' => 'p',
+ 'σ' => 's',
+ 'φ' => 'f',
+ 'ψ' => 'ps',
+
+ // Thai
+ 'ก' => 'k',
+ 'ข' => 'kh',
+ 'ฃ' => 'kh',
+ 'ค' => 'kh',
+ 'ฅ' => 'kh',
+ 'ฆ' => 'kh',
+ 'ง' => 'ng',
+ 'จ' => 'ch',
+ 'ฉ' => 'ch',
+ 'ช' => 'ch',
+ 'ซ' => 's',
+ 'ฌ' => 'ch',
+ 'ญ' => 'y',
+ 'ฎ' => 'd',
+ 'ฏ' => 't',
+ 'ฐ' => 'th',
+ 'ฑ' => 'd',
+ 'ฒ' => 'th',
+ 'ณ' => 'n',
+ 'ด' => 'd',
+ 'ต' => 't',
+ 'ถ' => 'th',
+ 'ท' => 'th',
+ 'ธ' => 'th',
+ 'น' => 'n',
+ 'บ' => 'b',
+ 'ป' => 'p',
+ 'ผ' => 'ph',
+ 'ฝ' => 'f',
+ 'พ' => 'ph',
+ 'ฟ' => 'f',
+ 'ภ' => 'ph',
+ 'ม' => 'm',
+ 'ย' => 'y',
+ 'ร' => 'r',
+ 'ฤ' => 'rue',
+ 'ฤๅ' => 'rue',
+ 'ล' => 'l',
+ 'ฦ' => 'lue',
+ 'ฦๅ' => 'lue',
+ 'ว' => 'w',
+ 'ศ' => 's',
+ 'ษ' => 's',
+ 'ส' => 's',
+ 'ห' => 'h',
+ 'ฬ' => 'l',
+ 'ฮ' => 'h',
+ 'ะ' => 'a',
+ 'ั' => 'a',
+ 'รร' => 'a',
+ 'า' => 'a',
+ 'ๅ' => 'a',
+ 'ำ' => 'am',
+ 'ํา' => 'am',
+ 'ิ' => 'i',
+ 'ี' => 'i',
+ 'ึ' => 'ue',
+ 'ี' => 'ue',
+ 'ุ' => 'u',
+ 'ู' => 'u',
+ 'เ' => 'e',
+ 'แ' => 'ae',
+ 'โ' => 'o',
+ 'อ' => 'o',
+ 'ียะ' => 'ia',
+ 'ีย' => 'ia',
+ 'ือะ' => 'uea',
+ 'ือ' => 'uea',
+ 'ัวะ' => 'ua',
+ 'ัว' => 'ua',
+ 'ใ' => 'ai',
+ 'ไ' => 'ai',
+ 'ัย' => 'ai',
+ 'าย' => 'ai',
+ 'าว' => 'ao',
+ 'ุย' => 'ui',
+ 'อย' => 'oi',
+ 'ือย' => 'ueai',
+ 'วย' => 'uai',
+ 'ิว' => 'io',
+ '็ว' => 'eo',
+ 'ียว' => 'iao',
+ '่' => '',
+ '้' => '',
+ '๊' => '',
+ '๋' => '',
+ '็' => '',
+ '์' => '',
+ '๎' => '',
+ 'ํ' => '',
+ 'ฺ' => '',
+ 'ๆ' => '2',
+ '๏' => 'o',
+ 'ฯ' => '-',
+ '๚' => '-',
+ '๛' => '-',
+ '๐' => '0',
+ '๑' => '1',
+ '๒' => '2',
+ '๓' => '3',
+ '๔' => '4',
+ '๕' => '5',
+ '๖' => '6',
+ '๗' => '7',
+ '๘' => '8',
+ '๙' => '9',
+
+ // Korean
+ 'ㄱ' => 'k', 'ㅋ' => 'kh',
+ 'ㄲ' => 'kk',
+ 'ㄷ' => 't',
+ 'ㅌ' => 'th',
+ 'ㄸ' => 'tt',
+ 'ㅂ' => 'p',
+ 'ㅍ' => 'ph',
+ 'ㅃ' => 'pp',
+ 'ㅈ' => 'c',
+ 'ㅊ' => 'ch',
+ 'ㅉ' => 'cc',
+ 'ㅅ' => 's',
+ 'ㅆ' => 'ss',
+ 'ㅎ' => 'h',
+ 'ㅇ' => 'ng',
+ 'ㄴ' => 'n',
+ 'ㄹ' => 'l',
+ 'ㅁ' => 'm',
+ 'ㅏ' => 'a',
+ 'ㅓ' => 'e',
+ 'ㅗ' => 'o',
+ 'ㅜ' => 'wu',
+ 'ㅡ' => 'u',
+ 'ㅣ' => 'i',
+ 'ㅐ' => 'ay',
+ 'ㅔ' => 'ey',
+ 'ㅚ' => 'oy',
+ 'ㅘ' => 'wa',
+ 'ㅝ' => 'we',
+ 'ㅟ' => 'wi',
+ 'ㅙ' => 'way',
+ 'ㅞ' => 'wey',
+ 'ㅢ' => 'uy',
+ 'ㅑ' => 'ya',
+ 'ㅕ' => 'ye',
+ 'ㅛ' => 'oy',
+ 'ㅠ' => 'yu',
+ 'ㅒ' => 'yay',
+ 'ㅖ' => 'yey',
+];
diff --git a/platform/www/inc/Utf8/tables/specials.php b/platform/www/inc/Utf8/tables/specials.php
new file mode 100644
index 0000000..f6243bc
--- /dev/null
+++ b/platform/www/inc/Utf8/tables/specials.php
@@ -0,0 +1,615 @@
+<?php
+/**
+ * UTF-8 array of common special characters
+ *
+ * This array should contain the code points of all special characters (not a
+ * letter or digit) defined in the various local charsets - it's not a complete
+ * list of non-alphanum characters in UTF-8. It's not perfect but should match
+ * most cases of special chars.
+ *
+ * The controlchars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
+ * These chars are _not_ in the array either: _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
+ * @see \dokuwiki\Utf8\Clean::stripspecials()
+ */
+return [
+ 0x1a, // <control>
+ 0x1b, // <control>
+ 0x1c, // <control>
+ 0x1d, // <control>
+ 0x1e, // <control>
+ 0x1f, // <control>
+ 0x20, // <space>
+ 0x21, // !
+ 0x22, // "
+ 0x23, // #
+ 0x24, // $
+ 0x25, // %
+ 0x26, // &
+ 0x27, // '
+ 0x28, // (
+ 0x29, // )
+ 0x2b, // +
+ 0x2c, // ,
+ 0x2f, // /
+ 0x3b, // ;
+ 0x3c, // <
+ 0x3d, // =
+ 0x3e, // >
+ 0x3f, // ?
+ 0x40, // @
+ 0x5b, // [
+ 0x5c, // \
+ 0x5d, // ]
+ 0x5e, // ^
+ 0x60, // `
+ 0x7b, // {
+ 0x7c, // |
+ 0x7d, // }
+ 0x7e, // ~
+ 0x7f, // <control>
+ 0x80, // €
+ 0x81, // <control>
+ 0x82, // ‚
+ 0x83, // ƒ
+ 0x84, // „
+ 0x85, // …
+ 0x86, // †
+ 0x87, // ‡
+ 0x88, // ˆ
+ 0x89, // ‰
+ 0x8a, // Š
+ 0x8b, // ‹
+ 0x8c, // Œ
+ 0x8d, // <control>
+ 0x8e, // Ž
+ 0x8f, // <control>
+ 0x90, // <control>
+ 0x91, // ‘
+ 0x92, // ’
+ 0x93, // “
+ 0x94, // ”
+ 0x95, // •
+ 0x96, // –
+ 0x97, // —
+ 0x98, // ˜
+ 0x99, // ™
+ 0x9a, // š
+ 0x9b, // ›
+ 0x9c, // œ
+ 0x9d, // <control>
+ 0x9e, // ž
+ 0x9f, // Ÿ
+ 0xa0, // <no-break space>
+ 0xa1, // ¡
+ 0xa2, // ¢
+ 0xa3, // £
+ 0xa4, // ¤
+ 0xa5, // ¥
+ 0xa6, // ¦
+ 0xa7, // §
+ 0xa8, // ¨
+ 0xa9, // ©
+ 0xaa, // ª
+ 0xab, // «
+ 0xac, // ¬
+ 0xad, // <soft hyphen>
+ 0xae, // ®
+ 0xaf, // ¯
+ 0xb0, // °
+ 0xb1, // ±
+ 0xb2, // ²
+ 0xb3, // ³
+ 0xb4, // ´
+ 0xb5, // µ
+ 0xb6, // ¶
+ 0xb7, // ·
+ 0xb8, // ¸
+ 0xb9, // ¹
+ 0xba, // º
+ 0xbb, // »
+ 0xbc, // ¼
+ 0xbd, // ½
+ 0xbe, // ¾
+ 0xbf, // ¿
+ 0xd7, // ×
+ 0xf7, // ÷
+ 0x2c7, // ˇ
+ 0x2d8, // ˘
+ 0x2d9, // ˙
+ 0x2da, // ˚
+ 0x2db, // ˛
+ 0x2dc, // ˜
+ 0x2dd, // ˝
+ 0x300, // ̀
+ 0x301, // ́
+ 0x303, // ̃
+ 0x309, // ̉
+ 0x323, // ̣
+ 0x384, // ΄
+ 0x385, // ΅
+ 0x387, // ·
+ 0x5b0, // ְ
+ 0x5b1, // ֱ
+ 0x5b2, // ֲ
+ 0x5b3, // ֳ
+ 0x5b4, // ִ
+ 0x5b5, // ֵ
+ 0x5b6, // ֶ
+ 0x5b7, // ַ
+ 0x5b8, // ָ
+ 0x5b9, // ֹ
+ 0x5bb, // ֻ
+ 0x5bc, // ּ
+ 0x5bd, // ֽ
+ 0x5be, // ־
+ 0x5bf, // ֿ
+ 0x5c0, // ׀
+ 0x5c1, // ׁ
+ 0x5c2, // ׂ
+ 0x5c3, // ׃
+ 0x5f3, // ׳
+ 0x5f4, // ״
+ 0x60c, // ،
+ 0x61b, // ؛
+ 0x61f, // ؟
+ 0x640, // ـ
+ 0x64b, // ً
+ 0x64c, // ٌ
+ 0x64d, // ٍ
+ 0x64e, // َ
+ 0x64f, // ُ
+ 0x650, // ِ
+ 0x651, // ّ
+ 0x652, // ْ
+ 0x66a, // ٪
+ 0xe3f, // ฿
+ 0x200c, // <zero width non-joiner>
+ 0x200d, // <zero width joiner>
+ 0x200e, // <left-to-right mark>
+ 0x200f, // <right-to-left mark>
+ 0x2013, // –
+ 0x2014, // —
+ 0x2015, // ―
+ 0x2017, // ‗
+ 0x2018, // ‘
+ 0x2019, // ’
+ 0x201a, // ‚
+ 0x201c, // “
+ 0x201d, // ”
+ 0x201e, // „
+ 0x2020, // †
+ 0x2021, // ‡
+ 0x2022, // •
+ 0x2026, // …
+ 0x2030, // ‰
+ 0x2032, // ′
+ 0x2033, // ″
+ 0x2039, // ‹
+ 0x203a, // ›
+ 0x2044, // ⁄
+ 0x20a7, // ₧
+ 0x20aa, // ₪
+ 0x20ab, // ₫
+ 0x20ac, // €
+ 0x2116, // №
+ 0x2118, // ℘
+ 0x2122, // ™
+ 0x2126, // Ω
+ 0x2135, // ℵ
+ 0x2190, // ←
+ 0x2191, // ↑
+ 0x2192, // →
+ 0x2193, // ↓
+ 0x2194, // ↔
+ 0x2195, // ↕
+ 0x21b5, // ↵
+ 0x21d0, // ⇐
+ 0x21d1, // ⇑
+ 0x21d2, // ⇒
+ 0x21d3, // ⇓
+ 0x21d4, // ⇔
+ 0x2200, // ∀
+ 0x2202, // ∂
+ 0x2203, // ∃
+ 0x2205, // ∅
+ 0x2206, // ∆
+ 0x2207, // ∇
+ 0x2208, // ∈
+ 0x2209, // ∉
+ 0x220b, // ∋
+ 0x220f, // ∏
+ 0x2211, // ∑
+ 0x2212, // −
+ 0x2215, // ∕
+ 0x2217, // ∗
+ 0x2219, // ∙
+ 0x221a, // √
+ 0x221d, // ∝
+ 0x221e, // ∞
+ 0x2220, // ∠
+ 0x2227, // ∧
+ 0x2228, // ∨
+ 0x2229, // ∩
+ 0x222a, // ∪
+ 0x222b, // ∫
+ 0x2234, // ∴
+ 0x223c, // ∼
+ 0x2245, // ≅
+ 0x2248, // ≈
+ 0x2260, // ≠
+ 0x2261, // ≡
+ 0x2264, // ≤
+ 0x2265, // ≥
+ 0x2282, // ⊂
+ 0x2283, // ⊃
+ 0x2284, // ⊄
+ 0x2286, // ⊆
+ 0x2287, // ⊇
+ 0x2295, // ⊕
+ 0x2297, // ⊗
+ 0x22a5, // ⊥
+ 0x22c5, // ⋅
+ 0x2310, // ⌐
+ 0x2320, // ⌠
+ 0x2321, // ⌡
+ 0x2329, // 〈
+ 0x232a, // 〉
+ 0x2469, // ⑩
+ 0x2500, // ─
+ 0x2502, // │
+ 0x250c, // ┌
+ 0x2510, // ┐
+ 0x2514, // └
+ 0x2518, // ┘
+ 0x251c, // ├
+ 0x2524, // ┤
+ 0x252c, // ┬
+ 0x2534, // ┴
+ 0x253c, // ┼
+ 0x2550, // ═
+ 0x2551, // ║
+ 0x2552, // ╒
+ 0x2553, // ╓
+ 0x2554, // ╔
+ 0x2555, // ╕
+ 0x2556, // ╖
+ 0x2557, // ╗
+ 0x2558, // ╘
+ 0x2559, // ╙
+ 0x255a, // ╚
+ 0x255b, // ╛
+ 0x255c, // ╜
+ 0x255d, // ╝
+ 0x255e, // ╞
+ 0x255f, // ╟
+ 0x2560, // ╠
+ 0x2561, // ╡
+ 0x2562, // ╢
+ 0x2563, // ╣
+ 0x2564, // ╤
+ 0x2565, // ╥
+ 0x2566, // ╦
+ 0x2567, // ╧
+ 0x2568, // ╨
+ 0x2569, // ╩
+ 0x256a, // ╪
+ 0x256b, // ╫
+ 0x256c, // ╬
+ 0x2580, // ▀
+ 0x2584, // ▄
+ 0x2588, // █
+ 0x258c, // ▌
+ 0x2590, // ▐
+ 0x2591, // ░
+ 0x2592, // ▒
+ 0x2593, // ▓
+ 0x25a0, // ■
+ 0x25b2, // ▲
+ 0x25bc, // ▼
+ 0x25c6, // ◆
+ 0x25ca, // ◊
+ 0x25cf, // ●
+ 0x25d7, // ◗
+ 0x2605, // ★
+ 0x260e, // ☎
+ 0x261b, // ☛
+ 0x261e, // ☞
+ 0x2660, // ♠
+ 0x2663, // ♣
+ 0x2665, // ♥
+ 0x2666, // ♦
+ 0x2701, // ✁
+ 0x2702, // ✂
+ 0x2703, // ✃
+ 0x2704, // ✄
+ 0x2706, // ✆
+ 0x2707, // ✇
+ 0x2708, // ✈
+ 0x2709, // ✉
+ 0x270c, // ✌
+ 0x270d, // ✍
+ 0x270e, // ✎
+ 0x270f, // ✏
+ 0x2710, // ✐
+ 0x2711, // ✑
+ 0x2712, // ✒
+ 0x2713, // ✓
+ 0x2714, // ✔
+ 0x2715, // ✕
+ 0x2716, // ✖
+ 0x2717, // ✗
+ 0x2718, // ✘
+ 0x2719, // ✙
+ 0x271a, // ✚
+ 0x271b, // ✛
+ 0x271c, // ✜
+ 0x271d, // ✝
+ 0x271e, // ✞
+ 0x271f, // ✟
+ 0x2720, // ✠
+ 0x2721, // ✡
+ 0x2722, // ✢
+ 0x2723, // ✣
+ 0x2724, // ✤
+ 0x2725, // ✥
+ 0x2726, // ✦
+ 0x2727, // ✧
+ 0x2729, // ✩
+ 0x272a, // ✪
+ 0x272b, // ✫
+ 0x272c, // ✬
+ 0x272d, // ✭
+ 0x272e, // ✮
+ 0x272f, // ✯
+ 0x2730, // ✰
+ 0x2731, // ✱
+ 0x2732, // ✲
+ 0x2733, // ✳
+ 0x2734, // ✴
+ 0x2735, // ✵
+ 0x2736, // ✶
+ 0x2737, // ✷
+ 0x2738, // ✸
+ 0x2739, // ✹
+ 0x273a, // ✺
+ 0x273b, // ✻
+ 0x273c, // ✼
+ 0x273d, // ✽
+ 0x273e, // ✾
+ 0x273f, // ✿
+ 0x2740, // ❀
+ 0x2741, // ❁
+ 0x2742, // ❂
+ 0x2743, // ❃
+ 0x2744, // ❄
+ 0x2745, // ❅
+ 0x2746, // ❆
+ 0x2747, // ❇
+ 0x2748, // ❈
+ 0x2749, // ❉
+ 0x274a, // ❊
+ 0x274b, // ❋
+ 0x274d, // ❍
+ 0x274f, // ❏
+ 0x2750, // ❐
+ 0x2751, // ❑
+ 0x2752, // ❒
+ 0x2756, // ❖
+ 0x2758, // ❘
+ 0x2759, // ❙
+ 0x275a, // ❚
+ 0x275b, // ❛
+ 0x275c, // ❜
+ 0x275d, // ❝
+ 0x275e, // ❞
+ 0x2761, // ❡
+ 0x2762, // ❢
+ 0x2763, // ❣
+ 0x2764, // ❤
+ 0x2765, // ❥
+ 0x2766, // ❦
+ 0x2767, // ❧
+ 0x277f, // ❿
+ 0x2789, // ➉
+ 0x2793, // ➓
+ 0x2794, // ➔
+ 0x2798, // ➘
+ 0x2799, // ➙
+ 0x279a, // ➚
+ 0x279b, // ➛
+ 0x279c, // ➜
+ 0x279d, // ➝
+ 0x279e, // ➞
+ 0x279f, // ➟
+ 0x27a0, // ➠
+ 0x27a1, // ➡
+ 0x27a2, // ➢
+ 0x27a3, // ➣
+ 0x27a4, // ➤
+ 0x27a5, // ➥
+ 0x27a6, // ➦
+ 0x27a7, // ➧
+ 0x27a8, // ➨
+ 0x27a9, // ➩
+ 0x27aa, // ➪
+ 0x27ab, // ➫
+ 0x27ac, // ➬
+ 0x27ad, // ➭
+ 0x27ae, // ➮
+ 0x27af, // ➯
+ 0x27b1, // ➱
+ 0x27b2, // ➲
+ 0x27b3, // ➳
+ 0x27b4, // ➴
+ 0x27b5, // ➵
+ 0x27b6, // ➶
+ 0x27b7, // ➷
+ 0x27b8, // ➸
+ 0x27b9, // ➹
+ 0x27ba, // ➺
+ 0x27bb, // ➻
+ 0x27bc, // ➼
+ 0x27bd, // ➽
+ 0x27be, // ➾
+ 0x3000, // <ideographic space>
+ 0x3001, // 、
+ 0x3002, // 。
+ 0x3003, // 〃
+ 0x3008, // 〈
+ 0x3009, // 〉
+ 0x300a, // 《
+ 0x300b, // 》
+ 0x300c, // 「
+ 0x300d, // 」
+ 0x300e, // 『
+ 0x300f, // 』
+ 0x3010, // 【
+ 0x3011, // 】
+ 0x3012, // 〒
+ 0x3014, // 〔
+ 0x3015, // 〕
+ 0x3016, // 〖
+ 0x3017, // 〗
+ 0x3018, // 〘
+ 0x3019, // 〙
+ 0x301a, // 〚
+ 0x301b, // 〛
+ 0x3036, // 〶
+ 0xf6d9, // <private use>
+ 0xf6da, // <private use>
+ 0xf6db, // <private use>
+ 0xf8d7, // <private use>
+ 0xf8d8, // <private use>
+ 0xf8d9, // <private use>
+ 0xf8da, // <private use>
+ 0xf8db, // <private use>
+ 0xf8dc, // <private use>
+ 0xf8dd, // <private use>
+ 0xf8de, // <private use>
+ 0xf8df, // <private use>
+ 0xf8e0, // <private use>
+ 0xf8e1, // <private use>
+ 0xf8e2, // <private use>
+ 0xf8e3, // <private use>
+ 0xf8e4, // <private use>
+ 0xf8e5, // <private use>
+ 0xf8e6, // <private use>
+ 0xf8e7, // <private use>
+ 0xf8e8, // <private use>
+ 0xf8e9, // <private use>
+ 0xf8ea, // <private use>
+ 0xf8eb, // <private use>
+ 0xf8ec, // <private use>
+ 0xf8ed, // <private use>
+ 0xf8ee, // <private use>
+ 0xf8ef, // <private use>
+ 0xf8f0, // <private use>
+ 0xf8f1, // <private use>
+ 0xf8f2, // <private use>
+ 0xf8f3, // <private use>
+ 0xf8f4, // <private use>
+ 0xf8f5, // <private use>
+ 0xf8f6, // <private use>
+ 0xf8f7, // <private use>
+ 0xf8f8, // <private use>
+ 0xf8f9, // <private use>
+ 0xf8fa, // <private use>
+ 0xf8fb, // <private use>
+ 0xf8fc, // <private use>
+ 0xf8fd, // <private use>
+ 0xf8fe, // <private use>
+ 0xfe7c, // ﹼ
+ 0xfe7d, // ﹽ
+ 0xff01, // !
+ 0xff02, // "
+ 0xff03, // #
+ 0xff04, // $
+ 0xff05, // %
+ 0xff06, // &
+ 0xff07, // '
+ 0xff08, // (
+ 0xff09, // )
+ 0xff0a, // *
+ 0xff0b, // +
+ 0xff0c, // ,
+ 0xff0d, // -
+ 0xff0e, // .
+ 0xff0f, // /
+ 0xff1a, // :
+ 0xff1b, // ;
+ 0xff1c, // <
+ 0xff1d, // =
+ 0xff1e, // >
+ 0xff1f, // ?
+ 0xff20, // @
+ 0xff3b, // [
+ 0xff3c, // \
+ 0xff3d, // ]
+ 0xff3e, // ^
+ 0xff40, // `
+ 0xff5b, // {
+ 0xff5c, // |
+ 0xff5d, // }
+ 0xff5e, // ~
+ 0xff5f, // ⦅
+ 0xff60, // ⦆
+ 0xff61, // 。
+ 0xff62, // 「
+ 0xff63, // 」
+ 0xff64, // 、
+ 0xff65, // ・
+ 0xffe0, // ¢
+ 0xffe1, // £
+ 0xffe2, // ¬
+ 0xffe3, //  ̄
+ 0xffe4, // ¦
+ 0xffe5, // ¥
+ 0xffe6, // ₩
+ 0xffe8, // │
+ 0xffe9, // ←
+ 0xffea, // ↑
+ 0xffeb, // →
+ 0xffec, // ↓
+ 0xffed, // ■
+ 0xffee, // ○
+ 0x1d6fc, // 𝛼
+ 0x1d6fd, // 𝛽
+ 0x1d6fe, // 𝛾
+ 0x1d6ff, // 𝛿
+ 0x1d700, // 𝜀
+ 0x1d701, // 𝜁
+ 0x1d702, // 𝜂
+ 0x1d703, // 𝜃
+ 0x1d704, // 𝜄
+ 0x1d705, // 𝜅
+ 0x1d706, // 𝜆
+ 0x1d707, // 𝜇
+ 0x1d708, // 𝜈
+ 0x1d709, // 𝜉
+ 0x1d70a, // 𝜊
+ 0x1d70b, // 𝜋
+ 0x1d70c, // 𝜌
+ 0x1d70d, // 𝜍
+ 0x1d70e, // 𝜎
+ 0x1d70f, // 𝜏
+ 0x1d710, // 𝜐
+ 0x1d711, // 𝜑
+ 0x1d712, // 𝜒
+ 0x1d713, // 𝜓
+ 0x1d714, // 𝜔
+ 0x1d715, // 𝜕
+ 0x1d716, // 𝜖
+ 0x1d717, // 𝜗
+ 0x1d718, // 𝜘
+ 0x1d719, // 𝜙
+ 0x1d71a, // 𝜚
+ 0x1d71b, // 𝜛
+ // Invisible characters, formerly given as raw UTF-8 byte values rather than code points
+ // (0xe28087, 0xe280af, 0xe281a0, 0xefbbbf; 0xc2a0 was dropped as it duplicated 0xa0 above).
+ 0x2007, // <figure space>
+ 0x202f, // <narrow no-break space>
+ 0x2060, // <word joiner>
+ 0xfeff, // <zero width no-break space / byte order mark>
+];
diff --git a/platform/www/inc/Utf8/tables/upperaccents.php b/platform/www/inc/Utf8/tables/upperaccents.php
new file mode 100644
index 0000000..e6e48de
--- /dev/null
+++ b/platform/www/inc/Utf8/tables/upperaccents.php
@@ -0,0 +1,114 @@
+<?php
+/**
+ * UTF-8 lookup table for upper case accented letters
+ *
+ * This lookup table maps accented upper case letters (keys) to their one- or
+ * two-letter replacements from the ASCII-7 range. Upper case letters only.
+ *
+ * @author Andreas Gohr <andi@splitbrain.org>
+ * @see \dokuwiki\Utf8\Clean::deaccent()
+ */
+return [
+ 'Á' => 'A',
+ 'À' => 'A',
+ 'Ă' => 'A',
+ 'Â' => 'A',
+ 'Å' => 'A',
+ 'Ä' => 'Ae', // two-letter replacement
+ 'Ã' => 'A',
+ 'Ą' => 'A',
+ 'Ā' => 'A',
+ 'Æ' => 'Ae', // ligature, two-letter replacement
+ 'Ḃ' => 'B',
+ 'Ć' => 'C',
+ 'Ĉ' => 'C',
+ 'Č' => 'C',
+ 'Ċ' => 'C',
+ 'Ç' => 'C',
+ 'Ď' => 'D',
+ 'Ḋ' => 'D',
+ 'Đ' => 'D',
+ 'Ð' => 'Dh', // eth, two-letter replacement
+ 'É' => 'E',
+ 'È' => 'E',
+ 'Ĕ' => 'E',
+ 'Ê' => 'E',
+ 'Ě' => 'E',
+ 'Ë' => 'E',
+ 'Ė' => 'E',
+ 'Ę' => 'E',
+ 'Ē' => 'E',
+ 'Ḟ' => 'F',
+ 'Ƒ' => 'F',
+ 'Ğ' => 'G',
+ 'Ĝ' => 'G',
+ 'Ġ' => 'G',
+ 'Ģ' => 'G',
+ 'Ĥ' => 'H',
+ 'Ħ' => 'H',
+ 'Í' => 'I',
+ 'Ì' => 'I',
+ 'Î' => 'I',
+ 'Ï' => 'I',
+ 'Ĩ' => 'I',
+ 'Į' => 'I',
+ 'Ī' => 'I',
+ 'Ĵ' => 'J',
+ 'Ķ' => 'K',
+ 'Ĺ' => 'L',
+ 'Ľ' => 'L',
+ 'Ļ' => 'L',
+ 'Ł' => 'L',
+ 'Ṁ' => 'M',
+ 'Ń' => 'N',
+ 'Ň' => 'N',
+ 'Ñ' => 'N',
+ 'Ņ' => 'N',
+ 'Ó' => 'O',
+ 'Ò' => 'O',
+ 'Ô' => 'O',
+ 'Ö' => 'Oe', // two-letter replacement
+ 'Ő' => 'O',
+ 'Õ' => 'O',
+ 'Ø' => 'O',
+ 'Ō' => 'O',
+ 'Ơ' => 'O',
+ 'Ṗ' => 'P',
+ 'Ŕ' => 'R',
+ 'Ř' => 'R',
+ 'Ŗ' => 'R',
+ 'Ś' => 'S',
+ 'Ŝ' => 'S',
+ 'Š' => 'S',
+ 'Ṡ' => 'S',
+ 'Ş' => 'S',
+ 'Ș' => 'S',
+ 'Ť' => 'T',
+ 'Ṫ' => 'T',
+ 'Ţ' => 'T',
+ 'Ț' => 'T',
+ 'Ŧ' => 'T',
+ 'Ú' => 'U',
+ 'Ù' => 'U',
+ 'Ŭ' => 'U',
+ 'Û' => 'U',
+ 'Ů' => 'U',
+ 'Ü' => 'Ue', // two-letter replacement
+ 'Ű' => 'U',
+ 'Ũ' => 'U',
+ 'Ų' => 'U',
+ 'Ū' => 'U',
+ 'Ư' => 'U',
+ 'Ẃ' => 'W',
+ 'Ẁ' => 'W',
+ 'Ŵ' => 'W',
+ 'Ẅ' => 'W',
+ 'Ý' => 'Y',
+ 'Ỳ' => 'Y',
+ 'Ŷ' => 'Y',
+ 'Ÿ' => 'Y',
+ 'Ź' => 'Z',
+ 'Ž' => 'Z',
+ 'Ż' => 'Z',
+ 'Þ' => 'Th', // thorn, two-letter replacement
+];