diff options
Diffstat (limited to 'www/wiki/extensions/Translate/utils/ArrayFlattener.php')
-rw-r--r-- | www/wiki/extensions/Translate/utils/ArrayFlattener.php | 236 |
1 files changed, 222 insertions, 14 deletions
diff --git a/www/wiki/extensions/Translate/utils/ArrayFlattener.php b/www/wiki/extensions/Translate/utils/ArrayFlattener.php index 8845ef1c..c5e61769 100644 --- a/www/wiki/extensions/Translate/utils/ArrayFlattener.php +++ b/www/wiki/extensions/Translate/utils/ArrayFlattener.php @@ -1,28 +1,43 @@ <?php /** - * Support for JSON message file format. + * Flattens message arrays for further processing. Supports parsing CLDR + * plural messages and converting them into MediaWiki's {{PLURAL}} syntax + * in a single message. * * @file * @author Niklas Laxström - * @license GPL-2.0+ + * @author Erik Moeller + * @license GPL-2.0-or-later * @since 2016.01 */ class ArrayFlattener { protected $sep; + protected $parseCLDRPlurals; - public function __construct( $sep = '.' ) { + // For CLDR pluralization rules + protected static $pluralWords = [ + 'zero' => 1, + 'one' => 1, + 'many' => 1, + 'few' => 1, + 'other' => 1, + 'two' => 1 + ]; + + public function __construct( $sep = '.', $parseCLDRPlurals = false ) { $this->sep = $sep; + $this->parseCLDRPlurals = $parseCLDRPlurals; } /** * Flattens multidimensional array. * - * @param array $unflat It's an array. 
+ * @param array $unflat Array of messages * @return array */ public function flatten( array $unflat ) { - $flat = array(); + $flat = []; foreach ( $unflat as $key => $value ) { if ( !is_array( $value ) ) { @@ -30,14 +45,21 @@ class ArrayFlattener { continue; } - // Placeholder for special plural processing + $plurals = false; + if ( $this->parseCLDRPlurals ) { + $plurals = $this->flattenCLDRPlurals( $value ); + } - $temp = array(); - foreach ( $value as $subKey => $subValue ) { - $newKey = "$key{$this->sep}$subKey"; - $temp[$newKey] = $subValue; + if ( $this->parseCLDRPlurals && $plurals ) { + $flat[$key] = $plurals; + } else { + $temp = []; + foreach ( $value as $subKey => $subValue ) { + $newKey = "$key{$this->sep}$subKey"; + $temp[$newKey] = $subValue; + } + $flat += $this->flatten( $temp ); } - $flat += $this->flatten( $temp ); // Can as well keep only one copy around. unset( $unflat[$key] ); @@ -47,13 +69,82 @@ class ArrayFlattener { } /** + * Flattens arrays that contain CLDR plural keywords into single values using + * MediaWiki's plural syntax. + * + * @param array $messages Array of messages + * + * @throws MWException + * @return bool|string + */ + public function flattenCLDRPlurals( $messages ) { + $pluralKeys = false; + $nonPluralKeys = false; + foreach ( $messages as $key => $value ) { + if ( is_array( $value ) ) { + // Plurals can only happen in the lowest level of the structure + return false; + } + + // Check if we find any reserved plural keyword + if ( isset( self::$pluralWords[$key] ) ) { + $pluralKeys = true; + } else { + $nonPluralKeys = true; + } + } + + // No plural keys at all, we can skip + if ( !$pluralKeys ) { + return false; + } + + // Mixed plural keys with other keys, should not happen + if ( $nonPluralKeys ) { + $keys = implode( ', ', array_keys( $messages ) ); + throw new MWException( "Reserved plural keywords mixed with other keys: $keys." 
); + } + + $pls = '{{PLURAL'; + foreach ( $messages as $key => $value ) { + if ( $key === 'other' ) { + continue; + } + + $pls .= "|$key=$value"; + } + + // Put the "other" alternative last, without other= prefix. + $other = isset( $messages['other'] ) ? '|' . $messages['other'] : ''; + $pls .= "$other}}"; + + return $pls; + } + + /** * Performs the reverse operation of flatten. * - * @param array $flat It's an array + * @param array $flat Array of messages * @return array */ public function unflatten( $flat ) { - $unflat = array(); + $unflat = []; + + if ( $this->parseCLDRPlurals ) { + $unflattenedPlurals = []; + foreach ( $flat as $key => $value ) { + $plurals = false; + if ( !is_array( $value ) ) { + $plurals = $this->unflattenCLDRPlurals( $key, $value ); + } + if ( $plurals ) { + $unflattenedPlurals += $plurals; + } else { + $unflattenedPlurals[$key] = $value; + } + } + $flat = $unflattenedPlurals; + } foreach ( $flat as $key => $value ) { $path = explode( $this->sep, $key ); @@ -67,7 +158,7 @@ class ArrayFlattener { /// Extract the level and make sure it exists. $level = array_shift( $path ); if ( !isset( $pointer[$level] ) ) { - $pointer[$level] = array(); + $pointer[$level] = []; } /// Update the pointer to the new reference. @@ -86,4 +177,121 @@ class ArrayFlattener { return $unflat; } + + /** + * Converts the MediaWiki plural syntax to array of CLDR style plurals + * + * @param string $key Message key prefix + * @param string $message The plural string + * + * @return bool|array + */ + public function unflattenCLDRPlurals( $key, $message ) { + // Quick escape. + if ( strpos( $message, '{{PLURAL' ) === false ) { + return false; + } + + /* + * Replace all variables with placeholders. Possible source of bugs + * if other characters than those given below are used. 
+ */ + $regex = '~\{[a-zA-Z_-]+}~'; + $placeholders = []; + $match = []; + + while ( preg_match( $regex, $message, $match ) ) { + $uniqkey = TranslateUtils::getPlaceholder(); + $placeholders[$uniqkey] = $match[0]; + $search = preg_quote( $match[0], '~' ); + $message = preg_replace( "~$search~", $uniqkey, $message ); + } + + // Then replace (possibly multiple) plural instances into placeholders. + $regex = '~\{\{PLURAL\|(.*?)}}~s'; + $matches = []; + $match = []; + + while ( preg_match( $regex, $message, $match ) ) { + $uniqkey = TranslateUtils::getPlaceholder(); + $matches[$uniqkey] = $match; + $message = preg_replace( $regex, $uniqkey, $message, 1 ); + } + + // No plurals, should not happen. + if ( !count( $matches ) ) { + return false; + } + + // The final array of alternative plural forms. + $alts = []; + + /* + * Then loop through each plural block, replacing the placeholders + * to construct the alternatives. Produces invalid output if there are + * multiple plural blocks which don't have the same set of keys. + */ + $pluralChoice = implode( '|', array_keys( self::$pluralWords ) ); + $regex = "~($pluralChoice)\s*=\s*(.+)~s"; + foreach ( $matches as $ph => $plu ) { + $forms = explode( '|', $plu[1] ); + + foreach ( $forms as $form ) { + if ( $form === '' ) { + continue; + } + + $match = []; + if ( preg_match( $regex, $form, $match ) ) { + $formWord = "$key{$this->sep}{$match[1]}"; + $value = $match[2]; + } else { + $formWord = "$key{$this->sep}other"; + $value = $form; + } + + if ( !isset( $alts[$formWord] ) ) { + $alts[$formWord] = $message; + } + + $string = $alts[$formWord]; + $alts[$formWord] = str_replace( $ph, $value, $string ); + } + } + + // Replace other variables. 
+ foreach ( $alts as &$value ) { + $value = str_replace( array_keys( $placeholders ), array_values( $placeholders ), $value ); + } + + if ( !isset( $alts["$key{$this->sep}other"] ) ) { + wfWarn( "Other not set for key $key" ); + } + + return $alts; + } + + /** + * Compares two strings for equal content, taking PLURAL expansion into account. + * + * @param string $a + * @param string $b + * @return bool Whether two strings are equal + */ + public function compareContent( $a, $b ) { + if ( !$this->parseCLDRPlurals ) { + return $a === $b; + } + + $a2 = $this->unflattenCLDRPlurals( 'prefix', $a ); + $b2 = $this->unflattenCLDRPlurals( 'prefix', $b ); + + // Fall back to regular comparison if parsing fails. + if ( $a2 === false || $b2 === false ) { + return $a === $b; + } + + // Require key-value pairs to match, but ignore order and types (all should be strings). + return $a2 == $b2; + } } |