diff options
Diffstat (limited to 'www/wiki/extensions/UniversalLanguageSelector/data/LanguageNameIndexer.php')
-rw-r--r-- | www/wiki/extensions/UniversalLanguageSelector/data/LanguageNameIndexer.php | 160 |
1 files changed, 160 insertions, 0 deletions
diff --git a/www/wiki/extensions/UniversalLanguageSelector/data/LanguageNameIndexer.php b/www/wiki/extensions/UniversalLanguageSelector/data/LanguageNameIndexer.php new file mode 100644 index 00000000..c9e45750 --- /dev/null +++ b/www/wiki/extensions/UniversalLanguageSelector/data/LanguageNameIndexer.php @@ -0,0 +1,160 @@ +<?php +/** + * Script to create language names index. + * + * Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris, + * Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other + * contributors. See CREDITS for a list. + * + * UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don't + * have to do anything special to choose one license or the other and you don't + * have to notify anyone which license you are using. You are free to use + * UniversalLanguageSelector in commercial projects as long as the copyright + * header is left intact. See files GPL-LICENSE and MIT-LICENSE for details. + * + * @file + * @ingroup Extensions + * @license GPL-2.0-or-later + * @license MIT + */ + +$IP = getenv( 'MW_INSTALL_PATH' ); +if ( $IP === false ) { + $IP = __DIR__ . '/../../..'; +} +require_once "$IP/maintenance/Maintenance.php"; + +class LanguageNameIndexer extends Maintenance { + public function __construct() { + parent::__construct(); + $this->addDescription( 'Script to create language names index.' ); + } + + public function execute() { + global $wgExtraLanguageNames; + + // Avoid local configuration leaking to this script + $wgExtraLanguageNames = []; + + $languages = Language::fetchLanguageNames( null, 'all' ); + + $buckets = []; + foreach ( $languages as $sourceLanguage => $autonym ) { + $translations = LanguageNames::getNames( $sourceLanguage, 0, 2 ); + + foreach ( $translations as $targetLanguage => $translation ) { + // Remove directionality markers used in Names.php: users are not + // going to type these. + $translation = str_replace( "\xE2\x80\x8E", '', $translation ); + $translation = mb_strtolower( $translation ); + $translation = trim( $translation ); + + // Clean up "gjermanishte zvicerane (dialekti i alpeve)" to "gjermanishte zvicerane". + // The original name is still shown, but avoid us creating entries such as + // "(dialekti" or "alpeve)". + $basicForm = preg_replace( '/\(.+\)$/', '', $translation ); + $words = preg_split( '/[\s]+/u', $basicForm, -1, PREG_SPLIT_NO_EMPTY ); + + foreach ( $words as $index => $word ) { + $bucket = LanguageNameSearch::getIndex( $word ); + + $type = 'prefix'; + $display = $translation; + if ( $index > 0 && count( $words ) > 1 ) { + $type = 'infix'; + $display = "$word — $translation"; + } + $buckets[$bucket][$type][$display] = $targetLanguage; + } + } + } + + // Some languages don't have a conveniently searchable name in CLDR. + // For example, the name of Western Punjabi doesn't start with + // the string "punjabi" in any language, so it cannot be found + // by people who search in English. + // To resolve this, some languages are added here locally. + $specialLanguages = [ + // Catalan, sometimes searched as "Valencià" + 'ca' => [ 'valencia' ], + // Spanish, the transliteration of the autonym is often used for searching + 'es' => [ 'castellano' ], + // Armenian, the transliteration of the autonym is often used for searching + 'hy' => [ 'hayeren' ], + // Georgian, the transliteration of the autonym is often used for searching + 'ka' => [ 'kartuli', 'qartuli' ], + // Japanese, the transliteration of the autonym is often used for searching + 'ja' => [ 'nihongo', 'にほんご' ], + // Western Punjabi, doesn't start with the word "Punjabi" in any language + 'pnb' => [ 'punjabi western' ], + // Simplified and Traditional Chinese, because zh-hans and zh-hant + // are not mapped to any English name + 'zh-hans' => [ 'chinese simplified' ], + 'zh-hant' => [ 'chinese traditional' ], + ]; + + foreach ( $specialLanguages as $targetLanguage => $translations ) { + foreach ( $translations as $translation ) { + $bucket = LanguageNameSearch::getIndex( $translation ); + $buckets[$bucket]['prefix'][$translation] = $targetLanguage; + } + } + + $lengths = []; + // Sorting the bucket contents gives two benefits: + // - more consistent output across environments + // - shortest matches appear first, especially exact matches + // Sort buckets by index + ksort( $buckets ); + foreach ( $buckets as $index => &$bucketTypes ) { + $lengths[] = array_sum( array_map( 'count', $bucketTypes ) ); + // Ensure 'prefix' is before 'infix'; + krsort( $bucketTypes ); + // Ensure each bucket has entries sorted + foreach ( $bucketTypes as $type => &$bucket ) { + ksort( $bucket ); + } + } + + $count = count( $buckets ); + $min = min( $lengths ); + $max = max( $lengths ); + $median = $lengths[ceil( $count / 2 )]; + $avg = array_sum( $lengths ) / $count; + $this->output( "Bucket stats:\n - $count buckets\n - smallest has $min entries\n" ); + $this->output( " - largest has $max entries\n - median size is $median entries\n" ); + $this->output( " - average size is $avg entries\n" ); + + $this->generateFile( $buckets ); + } + + private function generateFile( array $buckets ) { + $template = <<<PHP +<?php +// This file is generated by script! +class LanguageNameSearchData { + public static \$buckets = ___; +} + +PHP; + + // Format for short array format + $data = var_export( $buckets, true ); + $data = str_replace( "array (", '[', $data ); + $data = str_replace( "),", '],', $data ); + // Closing of the array, add correct indendation + $data = preg_replace( "/\)$/", "\t]", $data ); + // Remove newlines after =>s + $data = preg_replace( '/(=>)\s+(\[)/m', '\1 \2', $data ); + // Convert spaces to tabs. Since we are not top-level need more tabs. + $data = preg_replace( '/^ /m', "\t\t\t", $data ); + $data = preg_replace( '/^ /m', "\t\t", $data ); + + $template = str_replace( '___', $data, $template ); + + file_put_contents( __DIR__ . '/LanguageNameSearchData.php', $template ); + } +} + +$maintClass = LanguageNameIndexer::class; +require_once RUN_MAINTENANCE_IF_MAIN; |