summaryrefslogtreecommitdiff
path: root/www/wiki/extensions/SemanticMediaWiki/src/MediaWiki/Collator.php
blob: 5bf10d519b933b45ca7f9f58f086a8c8b5cd2707 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
<?php

namespace SMW\MediaWiki;

use Collation;

/**
 * Wrapper around a `Collation` instance that keeps one shared wrapper per
 * collation name and exposes sortkey related helpers (sortkey generation,
 * first-letter lookup, sortkey comparison and output armoring).
 *
 * @license GNU GPL v2+
 * @since 3.0
 *
 * @author mwjames
 */
class Collator {

	/**
	 * Shared instances created by `singleton`, indexed by collation name.
	 *
	 * @var Collator[]
	 */
	private static $instance = [];

	/**
	 * The wrapped collation all sortkey requests are delegated to.
	 *
	 * @var Collation
	 */
	private $collation;

	/**
	 * Name of the wrapped collation; inspected by `armor` to detect uca-*
	 * collations.
	 *
	 * @var string
	 */
	private $collationName;

	/**
	 * @private
	 *
	 * @since 3.0
	 *
	 * @param Collation $collation
	 * @param string $collationName
	 */
	public function __construct( Collation $collation, $collationName = '' ) {
		$this->collationName = $collationName;
		$this->collation = $collation;
	}

	/**
	 * Returns the shared instance for the given collation name, building it
	 * via `Collation::factory` on first use. An empty name is resolved to the
	 * value of `$GLOBALS['smwgEntityCollation']`.
	 *
	 * @since 3.0
	 *
	 * @param string $collationName
	 *
	 * @return Collator
	 */
	public static function singleton( $collationName = '' ) {

		// No explicit name given, rely on the globally configured collation
		if ( $collationName === '' ) {
			$collationName = $GLOBALS['smwgEntityCollation'];
		}

		if ( isset( self::$instance[$collationName] ) ) {
			return self::$instance[$collationName];
		}

		$collator = new self( Collation::factory( $collationName ), $collationName );
		self::$instance[$collationName] = $collator;

		return $collator;
	}

	/**
	 * Armors a sortkey generated by an uca-* collation so that it does not
	 * carry invalid or unrecognized UTF-8 characters into an XML/UTF output.
	 *
	 * The text is returned untouched when the collation name does not contain
	 * "uca". Otherwise:
	 * - the text is passed through `htmlspecialchars`, which entity-encodes
	 *   HTML special characters and substitutes invalid byte sequences
	 * - the substitution character is removed from the result
	 * - each run of whitespace and each run of characters from the Unicode
	 *   "Other" (`\p{C}`) category is replaced by a single "?"
	 *
	 * The outcome is inaccurate in comparison to the original uca-* sortkey
	 * but provides a near surrogate string for a back-end that requires XML
	 * compliance (triple store).
	 *
	 * @since 3.0
	 *
	 * @param string $text
	 * @param string $source not used by this method
	 *
	 * @return string
	 */
	public function armor( $text, $source = '' ) {

		// Anything other than an uca-* collation is passed through as-is
		if ( strpos( $this->collationName, 'uca' ) === false ) {
			return $text;
		}

		// https://magp.ie/2011/01/06/remove-non-utf8-characters-from-string-with-php/
		// ENT_SUBSTITUTE turns invalid sequences into the replacement
		// character which is stripped right after
		$escaped = htmlspecialchars( $text, ENT_SUBSTITUTE, 'UTF-8' );
		$text = str_replace( '�', '', $escaped );

		// Applied in order: whitespace runs (including non-breaking and other
		// non-standard spaces) first, control symbols second
		foreach ( [ '~\s+~u', '~\p{C}+~u' ] as $pattern ) {
			$text = preg_replace( $pattern, '?', $text );
		}

		return $text;
	}

	/**
	 * Returns the sortkey the wrapped collation produces for the text.
	 *
	 * @since 3.0
	 *
	 * @param string $text
	 *
	 * @return string
	 */
	public function getSortKey( $text ) {
		$sortKey = $this->collation->getSortKey( $text );

		return $sortKey;
	}

	/**
	 * Returns the first letter as determined by the wrapped collation, or an
	 * empty string for an empty text.
	 *
	 * @since 3.0
	 *
	 * @param string $text
	 *
	 * @return string
	 */
	public function getFirstLetter( $text ) {
		// Short-circuit on an empty string, the Collation instance would
		// otherwise fail with an "Uninitialized string offset: 0"
		return $text === '' ? '' : $this->collation->getFirstLetter( $text );
	}

	/**
	 * Checks whether both texts yield the very same sortkey.
	 *
	 * @since 3.0
	 *
	 * @param string $old
	 * @param string $new
	 *
	 * @return boolean
	 */
	public function isIdentical( $old, $new ) {
		$oldSortKey = $this->collation->getSortKey( $old );
		$newSortKey = $this->collation->getSortKey( $new );

		return $oldSortKey === $newSortKey;
	}

}