addDescription( 'Script to create language names index.' ); }

	/**
	 * Build the language name search index and hand it to generateFile().
	 *
	 * For every language returned by Language::fetchLanguageNames(), the names
	 * returned by LanguageNames::getNames() are normalised (directionality marks
	 * removed, lower-cased, trimmed) and split into words. Each word is placed
	 * in the bucket given by LanguageNameSearch::getIndex(): the first word as a
	 * 'prefix' entry, any later word as an 'infix' entry. A small hard-coded
	 * list of extra search terms is added afterwards, the buckets are sorted,
	 * and some size statistics are printed before the file is generated.
	 */
	public function execute() {
		global $wgExtraLanguageNames;

		// Avoid local configuration leaking to this script
		$wgExtraLanguageNames = [];

		$languages = Language::fetchLanguageNames( null, 'all' );

		$buckets = [];
		// Only the keys (language codes) are needed; the values are not used here.
		foreach ( array_keys( $languages ) as $sourceLanguage ) {
			$translations = LanguageNames::getNames( $sourceLanguage, 0, 2 );
			foreach ( $translations as $targetLanguage => $translation ) {
				// Remove directionality markers used in Names.php: users are not
				// going to type these.
				$translation = str_replace( "\xE2\x80\x8E", '', $translation );
				$translation = mb_strtolower( $translation );
				$translation = trim( $translation );

				// Clean up "gjermanishte zvicerane (dialekti i alpeve)" to "gjermanishte zvicerane".
				// The original name is still shown, but avoid us creating entries such as
				// "(dialekti" or "alpeve)".
				$basicForm = preg_replace( '/\(.+\)$/', '', $translation );
				$words = preg_split( '/[\s]+/u', $basicForm, -1, PREG_SPLIT_NO_EMPTY );

				foreach ( $words as $index => $word ) {
					$bucket = LanguageNameSearch::getIndex( $word );

					// The first word matches the start of the name; any later word
					// matches inside it and is displayed together with the full name.
					if ( $index > 0 ) {
						$type = 'infix';
						$display = "$word — $translation";
					} else {
						$type = 'prefix';
						$display = $translation;
					}

					$buckets[$bucket][$type][$display] = $targetLanguage;
				}
			}
		}

		// Some languages don't have a conveniently searchable name in CLDR.
		// For example, the name of Western Punjabi doesn't start with
		// the string "punjabi" in any language, so it cannot be found
		// by people who search in English.
		// To resolve this, some languages are added here locally.
		$specialLanguages = [
			// Catalan, sometimes searched as "Valencià"
			'ca' => [ 'valencia' ],
			// Spanish, the transliteration of the autonym is often used for searching
			'es' => [ 'castellano' ],
			// Armenian, the transliteration of the autonym is often used for searching
			'hy' => [ 'hayeren' ],
			// Georgian, the transliteration of the autonym is often used for searching
			'ka' => [ 'kartuli', 'qartuli' ],
			// Japanese, the transliteration of the autonym is often used for searching
			'ja' => [ 'nihongo', 'にほんご' ],
			// Western Punjabi, doesn't start with the word "Punjabi" in any language
			'pnb' => [ 'punjabi western' ],
			// Simplified and Traditional Chinese, because zh-hans and zh-hant
			// are not mapped to any English name
			'zh-hans' => [ 'chinese simplified' ],
			'zh-hant' => [ 'chinese traditional' ],
		];

		foreach ( $specialLanguages as $targetLanguage => $translations ) {
			foreach ( $translations as $translation ) {
				$bucket = LanguageNameSearch::getIndex( $translation );
				$buckets[$bucket]['prefix'][$translation] = $targetLanguage;
			}
		}

		$lengths = [];
		// Sorting the bucket contents gives two benefits:
		// - more consistent output across environments
		// - shortest matches appear first, especially exact matches
		// Sort buckets by index
		ksort( $buckets );
		foreach ( $buckets as &$bucketTypes ) {
			$lengths[] = array_sum( array_map( 'count', $bucketTypes ) );
			// Ensure 'prefix' is before 'infix';
			krsort( $bucketTypes );
			// Ensure each bucket has entries sorted
			foreach ( $bucketTypes as &$entries ) {
				ksort( $entries );
			}
			// Break the reference so the last entry list cannot be modified by accident
			unset( $entries );
		}
		// Same for the outer loop: $buckets is passed on to generateFile() below
		unset( $bucketTypes );

		// $buckets is never empty at this point: the special languages above
		// always contribute entries, so min(), max() and the division are safe.
		$count = count( $buckets );
		$min = min( $lengths );
		$max = max( $lengths );
		$avg = array_sum( $lengths ) / $count;

		// A median is only meaningful on sorted values; until now $lengths
		// was in bucket index order. min, max and sum are unaffected by order.
		sort( $lengths );
		$middle = (int)floor( $count / 2 );
		if ( $count % 2 === 1 ) {
			$median = $lengths[$middle];
		} else {
			// Even number of buckets: average the two middle sizes
			$median = ( $lengths[$middle - 1] + $lengths[$middle] ) / 2;
		}

		$this->output( "Bucket stats:\n - $count buckets\n - smallest has $min entries\n" );
		$this->output( " - largest has $max entries\n - median size is $median entries\n" );
		$this->output( " - average size is $avg entries\n" );

		$this->generateFile( $buckets );
	}

	private function generateFile( array $buckets )
{ $template = <<s $data = preg_replace( '/(=>)\s+(\[)/m', '\1 \2', $data ); // Convert spaces to tabs. Since we are not top-level need more tabs. $data = preg_replace( '/^ /m', "\t\t\t", $data ); $data = preg_replace( '/^ /m', "\t\t", $data ); $template = str_replace( '___', $data, $template ); file_put_contents( __DIR__ . '/LanguageNameSearchData.php', $template ); } } $maintClass = LanguageNameIndexer::class; require_once RUN_MAINTENANCE_IF_MAIN;