diff options
Diffstat (limited to 'www/wiki/extensions/SemanticMediaWiki/src/Utils/Tokenizer.php')
-rw-r--r-- | www/wiki/extensions/SemanticMediaWiki/src/Utils/Tokenizer.php | 52 |
1 files changed, 52 insertions, 0 deletions
/**
 * Splits a text into word-level tokens.
 *
 * @license GNU GPL v2+
 * @since 2.5
 *
 * @author mwjames
 */
class Tokenizer {

	/**
	 * Breaks the text into tokens and drops every token that consists of
	 * whitespace only.
	 *
	 * When the intl extension provides IntlRuleBasedBreakIterator, its word
	 * break iterator decides where a token starts and ends; otherwise the
	 * text is simply split on the space character.
	 *
	 * @since 2.5
	 *
	 * @param string $text
	 *
	 * @return array list of non-blank tokens, indexed from 0
	 */
	public static function tokenize( $text ) {

		// Guard against null/scalar input reaching explode() or setText()
		$text = (string)$text;

		if ( !class_exists( '\IntlRuleBasedBreakIterator' ) ) {
			return self::removeBlankTokens( explode( ' ', $text ) );
		}

		// As for CJK, this returns better results than trying to split
		// tokens by a single character
		$breakIterator = \IntlRuleBasedBreakIterator::createWordInstance( 'en' );
		$breakIterator->setText( $text );

		$start = 0;
		$segments = [];

		// Each iterated value is a boundary position; it is used as a byte
		// offset into $text, hence substr() and not mb_substr()
		foreach ( $breakIterator as $boundary ) {

			// The leading boundary does not close a segment
			if ( $boundary === 0 ) {
				continue;
			}

			$segments[] = substr( $text, $start, $boundary - $start );
			$start = $boundary;
		}

		return self::removeBlankTokens( $segments );
	}

	/**
	 * Removes empty and whitespace-only tokens and reindexes the list.
	 *
	 * Comparing against a single ' ' is not sufficient: tabs, line breaks
	 * and (depending on the break rules in use) a run of several spaces can
	 * be reported as one segment, and explode() yields '' for consecutive
	 * separators.
	 *
	 * @param array $tokens
	 *
	 * @return array
	 */
	private static function removeBlankTokens( array $tokens ) {
		return array_values(
			array_filter(
				$tokens,
				static function ( $token ) {
					return trim( (string)$token ) !== '';
				}
			)
		);
	}

}