summaryrefslogtreecommitdiff
path: root/www/wiki/extensions/SemanticMediaWiki/src/Utils/Tokenizer.php
blob: 0db427a1f79c6b5ded6ad696a4433349ddec47d7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
<?php

namespace SMW\Utils;

/**
 * @license GNU GPL v2+
 * @since 2.5
 *
 * @author mwjames
 */
class Tokenizer {

	/**
	 * Splits a text into a list of tokens.
	 *
	 * When the intl extension provides `IntlRuleBasedBreakIterator`, the text
	 * is segmented using a word break iterator for the 'en' locale. Segments
	 * that are empty or consist of exactly one space character are dropped,
	 * every other segment reported by the iterator is returned unchanged.
	 *
	 * When the class is not available, or a word break iterator cannot be
	 * created, the text is simply split on single space characters.
	 *
	 * @since 2.5
	 *
	 * @param string $text
	 *
	 * @return string[]
	 */
	public static function tokenize( $text ) {

		$breakIterator = null;

		if ( class_exists( '\IntlRuleBasedBreakIterator' ) ) {
			// As for CJK, this returns better results as trying to split tokens
			// by a single character
			$breakIterator = \IntlRuleBasedBreakIterator::createWordInstance( 'en' );
		}

		// Either intl is missing or createWordInstance() signalled a failure
		// by returning null; calling a method on it would be a fatal error,
		// so use the plain whitespace split instead
		if ( $breakIterator === null ) {
			return explode( ' ', $text );
		}

		$breakIterator->setText( $text );

		$start = 0;
		$tokens = [];

		// Iterating yields boundary offsets; each token is the slice between
		// the previous boundary and the current one
		foreach ( $breakIterator as $boundary ) {

			// The leading boundary does not close any segment
			if ( $boundary === 0 ) {
				continue;
			}

			// Offsets are expressed in UTF-8 code units (bytes), hence the
			// byte-wise substr() and not mb_substr()
			$segment = substr( $text, $start, $boundary - $start );

			if ( $segment !== '' && $segment !== ' ' ) {
				$tokens[] = $segment;
			}

			$start = $boundary;
		}

		return $tokens;
	}

}