first commit after acervus codebase

author: Yaco <franco@reevo.org> 2022-03-08 13:08:34 +0000
committer: Yaco <franco@reevo.org> 2022-03-08 13:08:34 +0000
commit: c985c40d3f3fc6a2be3be3186df3bf2f32189475 (patch)
tree: cee11f5e5a7e351ee0fec36d58d72cbee4f7e49b /platform/www/inc/Utf8/Conversion.php
1 files changed, 162 insertions, 0 deletions
diff --git a/platform/www/inc/Utf8/Conversion.php b/platform/www/inc/Utf8/Conversion.php
new file mode 100644
index 0000000..fad9cd0
--- /dev/null
+++ b/platform/www/inc/Utf8/Conversion.php
@@ -0,0 +1,162 @@
+<?php
+
+namespace dokuwiki\Utf8;
+
+/**
+ * Methods to convert from and to UTF-8 strings
+ */
+class Conversion
+{
+    /**
+     * Encodes UTF-8 characters to HTML entities
+     *
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     * @author <vpribish at shopping dot com>
+     * @link   http://php.net/manual/en/function.utf8-decode.php
+     *
+     * @param string $str UTF-8 encoded string
+     * @param bool $all Encode ASCII characters (below 0x80) as entities as well
+     * @return string Code points below 0x100 as decimal, all others as hexadecimal entities
+     */
+    public static function toHtml($str, $all = false)
+    {
+        $ret = '';
+        foreach (Unicode::fromUtf8($str) as $cp) {
+            if ($cp < 0x80 && !$all) {
+                $ret .= chr($cp);
+            } elseif ($cp < 0x100) {
+                $ret .= "&#$cp;";
+            } else {
+                $ret .= '&#x' . dechex($cp) . ';';
+            }
+        }
+        return $ret;
+    }
+
+    /**
+     * Decodes HTML entities to UTF-8 characters
+     *
+     * Converts any &#..; entity to its character.
+     * The entities flag defaults to only decoding numeric entities.
+     * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
+     * are handled as well. Avoids the problem that would occur if you
+     * had to decode "&amp;#38;&#38;amp;#38;"
+     *
+     * unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
+     * \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp#38;"
+     * what it should be                   -> "&#38;&amp#38;"
+     *
+     * @author Tom N Harris <tnharris@whoopdedo.org>
+     *
+     * @param  string $str UTF-8 encoded string
+     * @param  boolean $entities decode named entities in addition to numeric ones
+     * @return string  UTF-8 encoded string with numeric (and named) entities replaced.
+     */
+    public static function fromHtml($str, $entities = false)
+    {
+        // both patterns expose the hex marker as group 2 and the digits as group 3
+        if ($entities) {
+            return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m', [__CLASS__, 'decodeAnyEntity'], $str);
+        }
+        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m', [__CLASS__, 'decodeNumericEntity'], $str);
+    }
+
+    /**
+     * Decodes any HTML entity to its correct UTF-8 char equivalent
+     *
+     * Named entities that are not in the lookup table are returned unchanged.
+     *
+     * @param string[] $ent Match from fromHtml() (0: whole entity, 1: '#' when numeric)
+     * @return string|false
+     */
+    protected static function decodeAnyEntity($ent)
+    {
+        if ($ent[1] === '#') {
+            return self::decodeNumericEntity($ent);
+        }
+
+        // Lookup table entity => character, built on first use. It is requested
+        // in UTF-8, so multi-byte characters (e.g. the one for &euro;) are taken
+        // over as they are and must not be re-encoded from their first byte.
+        static $table = null;
+        if ($table === null) {
+            $table = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8'));
+        }
+
+        return isset($table[$ent[0]]) ? $table[$ent[0]] : $ent[0];
+    }
+
+    /**
+     * Decodes numeric HTML entities to their correct UTF-8 characters
+     *
+     * Malformed references like "&#foo;" or "&#xzz;" are returned unchanged.
+     *
+     * @param string[] $ent Match from fromHtml() (0: whole entity, 2: 'x' when hex, 3: digits)
+     * @return string|false
+     */
+    protected static function decodeNumericEntity($ent)
+    {
+        $hex = ($ent[2] === 'x' || $ent[2] === 'X');
+        // the patterns accept any alphanumerics; garbage would otherwise decode to code point 0
+        if (!preg_match($hex ? '/^[0-9A-Fa-f]+$/D' : '/^[0-9]+$/D', $ent[3])) {
+            return $ent[0];
+        }
+        return Unicode::toUtf8([$hex ? hexdec($ent[3]) : intval($ent[3])]);
+    }
+
+    /**
+     * UTF-8 to UTF-16BE conversion.
+     *
+     * When UTF8_MBSTRING is not set the code points are packed by hand;
+     * code points above 0xFFFF are written as surrogate pairs.
+     *
+     * @param string $str UTF-8 encoded string
+     * @param bool $bom Prepend the big endian byte order mark (FE FF)
+     * @return string
+     */
+    public static function toUtf16be($str, $bom = false)
+    {
+        $out = $bom ? "\xFE\xFF" : '';
+        if (UTF8_MBSTRING) {
+            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
+        }
+
+        foreach (Unicode::fromUtf8($str) as $cp) {
+            if ($cp > 0xFFFF) {
+                // a single 16 bit unit would silently truncate the value
+                $cp -= 0x10000;
+                $out .= pack('nn', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
+            } else {
+                $out .= pack('n', $cp);
+            }
+        }
+        return $out;
+    }
+
+    /**
+     * UTF-16BE to UTF-8 conversion.
+     *
+     * A high surrogate followed by a low surrogate is merged into one code point.
+     *
+     * @param string $str UTF-16BE encoded string
+     * @return false|string
+     */
+    public static function fromUtf16be($str)
+    {
+        $units = unpack('n*', $str);
+        if (!is_array($units)) {
+            return Unicode::toUtf8($units);
+        }
+        $uni = [];
+        foreach ($units as $unit) {
+            $last = count($uni) - 1;
+            $low = ($unit >= 0xDC00 && $unit <= 0xDFFF);
+            if ($low && $last >= 0 && $uni[$last] >= 0xD800 && $uni[$last] <= 0xDBFF) {
+                $uni[$last] = 0x10000 + (($uni[$last] - 0xD800) << 10) + ($unit - 0xDC00);
+            } else {
+                $uni[] = $unit;
+            }
+        }
+        return Unicode::toUtf8($uni);
+    }
+}
author	Yaco <franco@reevo.org>	2022-03-08 13:08:34 +0000
committer	Yaco <franco@reevo.org>	2022-03-08 13:08:34 +0000
commit	c985c40d3f3fc6a2be3be3186df3bf2f32189475 (patch)
tree	cee11f5e5a7e351ee0fec36d58d72cbee4f7e49b /platform/www/inc/Utf8/Conversion.php