$bucket ) { foreach ( $bucket as $name => $code ) { // We can skip checking languages we already have in the list if ( isset( $results[ $code ] ) ) { continue; } // Apply fuzzy search if ( !self::matchNames( $name, $searchKey, $typos ) ) { continue; } // Once we find a match, figure out the best name to display to the user // If $userLanguage is not provided (null), it is the same as autonym $candidates = [ mb_strtolower( Language::fetchLanguageName( $code, $userLanguage ) ), mb_strtolower( Language::fetchLanguageName( $code, null ) ), $name ]; foreach ( $candidates as $candidate ) { if ( $searchKey === $candidate ) { $results[$code] = $candidate; continue 2; } } foreach ( $candidates as $candidate ) { if ( self::matchNames( $candidate, $searchKey, $typos ) ) { $results[$code] = $candidate; continue 2; } } } } return $results; } public static function matchNames( $name, $searchKey, $typos ) { return strrpos( $name, $searchKey, -strlen( $name ) ) !== false || ( $typos > 0 && self::levenshteinDistance( $name, $searchKey ) <= $typos ); } public static function getIndex( $name ) { $codepoint = self::getCodepoint( $name ); if ( $codepoint < 4000 ) { // For latin etc. we need smaller buckets for speed return $codepoint; } else { // Try to group names of same script together return $codepoint - ( $codepoint % 1000 ); } } /** * Get the code point of first letter of string * * @param string $str * @return int Code point of first letter of string */ public static function getCodepoint( $str ) { $values = []; $lookingFor = 1; $strLen = strlen( $str ); $number = 0; for ( $i = 0; $i < $strLen; $i++ ) { $thisValue = ord( $str[$i] ); if ( $thisValue < 128 ) { $number = $thisValue; break; } else { // Codepoints larger than 127 are represented by multi-byte sequences if ( $values === [] ) { // 224 is the lowest non-overlong-encoded codepoint. $lookingFor = ( $thisValue < 224 ) ? 2 : 3; } $values[] = $thisValue; if ( count( $values ) === $lookingFor ) { // Refer http://en.wikipedia.org/wiki/UTF-8#Description if ( $lookingFor === 3 ) { $number = ( $values[0] % 16 ) * 4096; $number += ( $values[1] % 64 ) * 64; $number += $values[2] % 64; } else { $number = ( $values[0] % 32 ) * 64; $number += $values[1] % 64; } break; } } } return $number; } /** * Calculate the Levenshtein distance between two strings * @param string $str1 * @param string $str2 * @return int */ public static function levenshteinDistance( $str1, $str2 ) { if ( $str1 === $str2 ) { return 0; } $length1 = mb_strlen( $str1, 'UTF-8' ); $length2 = mb_strlen( $str2, 'UTF-8' ); if ( $length1 === 0 ) { return $length2; } if ( $length1 < $length2 ) { return self::levenshteinDistance( $str2, $str1 ); } $prevRow = range( 0, $length2 ); for ( $i = 0; $i < $length1; $i++ ) { $currentRow = []; $currentRow[0] = $i + 1; $c1 = mb_substr( $str1, $i, 1, 'UTF-8' ); for ( $j = 0; $j < $length2; $j++ ) { $c2 = mb_substr( $str2, $j, 1, 'UTF-8' ); $insertions = $prevRow[$j + 1] + 1; $deletions = $currentRow[$j] + 1; $substitutions = $prevRow[$j] + ( ( $c1 !== $c2 ) ? 1 : 0 ); $currentRow[] = min( $insertions, $deletions, $substitutions ); } $prevRow = $currentRow; } return $prevRow[$length2]; } }