summaryrefslogtreecommitdiff
path: root/platform/www/inc/Utf8/Clean.php
blob: 0975ff5597a7bca5c091334e801272b0fde69553 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
<?php

namespace dokuwiki\Utf8;

/**
 * Methods to assess and clean UTF-8 strings
 *
 * All methods are static and work on the raw bytes of PHP strings.
 */
class Clean
{
    /**
     * Checks if a string contains 7bit ASCII only
     *
     * The string counts as ASCII unless a byte outside 0x00-0x7F is found.
     *
     * @author Andreas Haerter <andreas.haerter@dev.mail-node.com>
     *
     * @param string $str
     * @return bool true if no byte above 0x7F was found
     */
    public static function isASCII($str)
    {
        return (preg_match('/(?:[^\x00-\x7F])/', $str) !== 1);
    }

    /**
     * Tries to detect if a string is in Unicode encoding
     *
     * This is a structural check only: every lead byte has to be followed by
     * the number of continuation bytes (10bbbbbb) it announces. The obsolete
     * 5 and 6 byte forms are accepted, and overlong encodings or surrogates
     * are not rejected.
     *
     * @author <bmorel@ssi.fr>
     * @link   http://php.net/manual/en/function.utf8-encode.php
     *
     * @param string $str
     * @return bool
     */
    public static function isUtf8($str)
    {
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $b = ord($str[$i]);
            if ($b < 0x80) {
                continue; // 0bbbbbbb
            } elseif (($b & 0xE0) === 0xC0) {
                $n = 1; // 110bbbbb
            } elseif (($b & 0xF0) === 0xE0) {
                $n = 2; // 1110bbbb
            } elseif (($b & 0xF8) === 0xF0) {
                $n = 3; // 11110bbb
            } elseif (($b & 0xFC) === 0xF8) {
                $n = 4; // 111110bb
            } elseif (($b & 0xFE) === 0xFC) {
                $n = 5; // 1111110b
            } else {
                return false; // Does not match any model
            }

            // n bytes matching 10bbbbbb have to follow
            for ($j = 0; $j < $n; $j++) {
                if ((++$i === $len) || ((ord($str[$i]) & 0xC0) !== 0x80)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Strips all high byte chars
     *
     * Returns a pure ASCII7 string: every byte with a value of 128 or above
     * is dropped, all other bytes are kept in order.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $str
     * @return string
     */
    public static function strip($str)
    {
        $ascii = '';
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            if (ord($str[$i]) < 128) {
                $ascii .= $str[$i];
            }
        }
        return $ascii;
    }

    /**
     * Removes special characters (nonalphanumeric) from a UTF-8 string
     *
     * This function adds the controlchars 0x00 to 0x19 to the array of
     * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
     *
     * The special chars are fetched from Table::specialChars() once and the
     * quoted result is cached for subsequent calls.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param  string $string The UTF8 string to strip of special chars
     * @param  string $repl Replace special with this string
     * @param  string $additional Additional chars to strip (used in regexp char class,
     *                            inserted as is - the caller has to escape it)
     * @return string|null the cleaned string; preg_replace() returns null on error, e.g.
     *                     when $string is not valid UTF-8 (the pattern uses the /u modifier)
     */
    public static function stripspecials($string, $repl = '', $additional = '')
    {
        static $specials = null;
        if ($specials === null) {
            $specials = preg_quote(Table::specialChars(), '/');
        }

        return preg_replace('/[' . $additional . '\x00-\x19' . $specials . ']/u', $repl, $string);
    }

    /**
     * Replace bad bytes with an alternative character
     *
     * ASCII character is recommended for replacement char
     *
     * PCRE Pattern to locate bad bytes in a UTF-8 string
     * Comes from W3 FAQ: Multilingual Forms
     * Note: modified to include full ASCII range including control chars
     *
     * The string is processed in a single pass: each well-formed character is
     * copied as is, each byte that is not part of a well-formed character is
     * replaced by $replace.
     *
     * @author Harry Fuecks <hfuecks@gmail.com>
     * @see http://www.w3.org/International/questions/qa-forms-utf-8
     *
     * @param string $str to search
     * @param string $replace to replace bad bytes with (defaults to '', i.e. bad bytes
     *                        are removed) - use ASCII
     * @return string the cleaned string; an empty string if the PCRE engine fails
     */
    public static function replaceBadBytes($str, $replace = '')
    {
        $UTF8_BAD =
            '([\x00-\x7F]' .                          // ASCII (including control chars)
            '|[\xC2-\xDF][\x80-\xBF]' .               // non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]' .           // excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}' .    // straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]' .           // excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}' .        // planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}' .            // planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}' .        // plane 16
            '|(.{1}))';                               // invalid byte

        $clean = preg_replace_callback(
            '/' . $UTF8_BAD . '/S',
            function ($matches) use ($replace) {
                // group 2 only takes part in the match when the catch-all
                // "invalid byte" alternative was used
                if (isset($matches[2]) && $matches[2] !== '') {
                    return $replace;
                }
                return $matches[0];
            },
            (string)$str
        );

        // null signals an internal PCRE failure; never hand back unchecked bytes
        return ($clean === null) ? '' : $clean;
    }


    /**
     * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
     *
     * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
     * letters. Default is to deaccent both cases ($case = 0)
     *
     * The replacement maps come from Table::lowerAccents() and Table::upperAccents()
     * and are applied with strtr().
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @param int $case
     * @return string
     */
    public static function deaccent($string, $case = 0)
    {
        if ($case <= 0) {
            $string = strtr($string, Table::lowerAccents());
        }
        if ($case >= 0) {
            $string = strtr($string, Table::upperAccents());
        }
        return $string;
    }

    /**
     * Romanize a non-latin string
     *
     * Pure ASCII input is returned untouched, everything else is run through
     * strtr() with the map provided by Table::romanization().
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param string $string
     * @return string
     */
    public static function romanize($string)
    {
        if (self::isASCII($string)) {
            return $string; //nothing to do
        }

        return strtr($string, Table::romanization());
    }

    /**
     * adjust a byte index into a utf8 string to a utf8 character boundary
     *
     * An index at or below 0 yields 0, an index at or beyond the end of the
     * string yields the string length. Otherwise the index is moved off any
     * continuation byte (10bbbbbb) in the requested direction.
     *
     * @author       chris smith <chris@jalakai.co.uk>
     *
     * @param string $str utf8 character string
     * @param int $i byte index into $str
     * @param bool $next direction to search for boundary, false = up (current character) true = down (next character)
     * @return int byte index into $str now pointing to a utf8 character boundary
     */
    public static function correctIdx($str, $i, $next = false)
    {

        if ($i <= 0) {
            return 0;
        }

        $limit = strlen($str);
        if ($i >= $limit) {
            return $limit;
        }

        if ($next) {
            // walk forward to the first byte of the following character
            while (($i < $limit) && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i++;
            }
        } else {
            // walk back to the first byte of the current character
            while ($i && ((ord($str[$i]) & 0xC0) === 0x80)) {
                $i--;
            }
        }

        return $i;
    }

}