summary | refs | log | tree | commit | diff
path: root/www/wiki/extensions/SemanticMediaWiki/src/Utils/Tokenizer.php
diff options
context:
space:
mode:
Diffstat (limited to 'www/wiki/extensions/SemanticMediaWiki/src/Utils/Tokenizer.php')
-rw-r--r-- www/wiki/extensions/SemanticMediaWiki/src/Utils/Tokenizer.php | 52
1 files changed, 52 insertions, 0 deletions
diff --git a/www/wiki/extensions/SemanticMediaWiki/src/Utils/Tokenizer.php b/www/wiki/extensions/SemanticMediaWiki/src/Utils/Tokenizer.php
new file mode 100644
index 00000000..0db427a1
--- /dev/null
+++ b/www/wiki/extensions/SemanticMediaWiki/src/Utils/Tokenizer.php
@@ -0,0 +1,52 @@
+<?php
+
+namespace SMW\Utils;
+
+/**
+ * @license GNU GPL v2+
+ * @since 2.5
+ *
+ * @author mwjames
+ */
+class Tokenizer {
+
+	/**
+	 * Splits a text into word tokens, dropping whitespace-only segments.
+	 *
+	 * With ext-intl the ICU word break iterator is used (for CJK better than
+	 * splitting by single characters); otherwise whitespace runs are split on.
+	 *
+	 * @since 2.5
+	 *
+	 * @param string $text
+	 *
+	 * @return string[]
+	 */
+	public static function tokenize( $text ) {
+
+		if ( !class_exists( '\IntlRuleBasedBreakIterator' ) ) {
+			return preg_split( '/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY );
+		}
+
+		$iterator = \IntlRuleBasedBreakIterator::createWordInstance( 'en' );
+		$iterator->setText( $text );
+
+		$tokens = [];
+		$prev = 0;
+
+		// The iterator reports boundaries as byte offsets into the UTF-8
+		// text, hence the byte-wise substr() instead of mb_substr()
+		foreach ( $iterator as $boundary ) {
+			$segment = substr( $text, $prev, $boundary - $prev );
+			$prev = $boundary;
+
+			// Drops blank runs, tabs, line breaks and the empty first segment
+			if ( trim( $segment ) !== '' ) {
+				$tokens[] = $segment;
+			}
+		}
+
+		return $tokens;
+	}
+
+}