blob: 0db427a1f79c6b5ded6ad696a4433349ddec47d7 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
|
<?php
namespace SMW\Utils;
/**
 * Splits a text into word-level tokens.
 *
 * When the intl extension provides `IntlRuleBasedBreakIterator`, word
 * boundaries are determined by its word instance; otherwise the text is
 * simply split on whitespace.
 *
 * @license GNU GPL v2+
 * @since 2.5
 *
 * @author mwjames
 */
class Tokenizer {

	/**
	 * Returns the tokens of a text in their original order.
	 *
	 * Segments that consist of whitespace only (spaces, tabs, line breaks,
	 * Unicode separators) are never part of the result, and an empty or
	 * whitespace-only text yields an empty array. Any other segment reported
	 * by the break iterator (including punctuation) is returned as is.
	 *
	 * @since 2.5
	 *
	 * @param string $text
	 *
	 * @return string[]
	 */
	public static function tokenize( $text ) {

		if ( !class_exists( '\IntlRuleBasedBreakIterator' ) ) {
			return self::splitOnWhitespace( $text );
		}

		// As for CJK, this returns better results than trying to split tokens
		// by a single character
		$breakIterator = \IntlRuleBasedBreakIterator::createWordInstance( 'en' );
		$breakIterator->setText( $text );

		$start = 0;
		$tokens = [];

		// Iterating yields successive boundary positions; they are used as
		// byte offsets, hence substr() and not mb_substr()
		foreach ( $breakIterator as $boundary ) {

			// The first position is the start of the text, no segment precedes it
			if ( $boundary === 0 ) {
				continue;
			}

			$segment = substr( $text, $start, $boundary - $start );

			if ( !self::isWhitespace( $segment ) ) {
				$tokens[] = $segment;
			}

			$start = $boundary;
		}

		return $tokens;
	}

	/**
	 * Fallback for when the intl extension is missing: splits on runs of
	 * whitespace and never returns empty tokens.
	 *
	 * @param string $text
	 *
	 * @return string[]
	 */
	private static function splitOnWhitespace( $text ) {

		$tokens = preg_split( '/[\s\p{Z}]+/u', $text, -1, PREG_SPLIT_NO_EMPTY );

		// The `u` modifier makes preg_split fail on malformed UTF-8; retry
		// byte-wise with ASCII whitespace only
		if ( $tokens === false ) {
			$tokens = preg_split( '/[\x09-\x0D\x20]+/', $text, -1, PREG_SPLIT_NO_EMPTY );
		}

		return $tokens === false ? [] : $tokens;
	}

	/**
	 * Whether a segment is empty or made up of whitespace only.
	 *
	 * @param string|false $segment
	 *
	 * @return boolean
	 */
	private static function isWhitespace( $segment ) {

		// Does not match (or returns false) for an empty segment or for
		// malformed UTF-8, both of which are covered by the ASCII check below
		if ( preg_match( '/^[\s\p{Z}]+$/Du', (string)$segment ) === 1 ) {
			return true;
		}

		return trim( (string)$segment, " \t\n\r\x0B\x0C" ) === '';
	}
}
|