summaryrefslogtreecommitdiff
path: root/platform/www/inc/Utf8/Conversion.php
blob: fad9cd0b1b0d3678fe343ded9ad65f85eb24fed4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
<?php

namespace dokuwiki\Utf8;

/**
 * Methods to convert from and to UTF-8 strings
 */
class Conversion
{

    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str
     * @param bool $all Encode non-utf8 char to HTML as well
     * @return string
     */
    public static function toHtml($str, $all = false)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80 && !$all) {
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * Convert any &#..; entity to a codepoint,
     * The entities flag defaults to only decoding numeric entities.
     * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
     * are handled as well. Avoids the problem that would occur if you
     * had to decode "&amp;#38;&#38;amp;#38;"
     *
     * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
     * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp#38;"
     * what it should be                   -> "&#38;&amp#38;"
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param  string $str UTF-8 encoded string
     * @param  boolean $entities decode name entities in addtition to numeric ones
     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        if (!$entities) {
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [__CLASS__, 'decodeNumericEntity'],
                $str
            );
        }

        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [__CLASS__, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to it's correct UTF-8 char equivalent
     *
     * @param string $ent An entity
     * @return string
     */
    protected static function decodeAnyEntity($ent)
    {
        // create the named entity lookup table
        static $table = null;
        if ($table === null) {
            $table = get_html_translation_table(HTML_ENTITIES);
            $table = array_flip($table);
            $table = array_map(
                static function ($c) {
                    return Unicode::toUtf8(array(ord($c)));
                },
                $table
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        if (array_key_exists($ent[0], $table)) {
            return $table[$ent[0]];
        }

        return $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * @param $ent string A numeric entity
     * @return string|false
     */
    protected static function decodeNumericEntity($ent)
    {
        switch ($ent[2]) {
            case 'X':
            case 'x':
                $cp = hexdec($ent[3]);
                break;
            default:
                $cp = intval($ent[3]);
                break;
        }
        return Unicode::toUtf8(array($cp));
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
     *
     * @param string $str
     * @param bool $bom
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        $uni = Unicode::fromUtf8($str);
        foreach ($uni as $cp) {
            $out .= pack('n', $cp);
        }
        return $out;
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
     *
     * @param string $str
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $uni = unpack('n*', $str);
        return Unicode::toUtf8($uni);
    }

}