summaryrefslogtreecommitdiff
path: root/www/wiki/extensions/UniversalLanguageSelector/data/LanguageNameIndexer.php
diff options
context:
space:
mode:
Diffstat (limited to 'www/wiki/extensions/UniversalLanguageSelector/data/LanguageNameIndexer.php')
-rw-r--r-- www/wiki/extensions/UniversalLanguageSelector/data/LanguageNameIndexer.php 160
1 files changed, 160 insertions, 0 deletions
diff --git a/www/wiki/extensions/UniversalLanguageSelector/data/LanguageNameIndexer.php b/www/wiki/extensions/UniversalLanguageSelector/data/LanguageNameIndexer.php
new file mode 100644
index 00000000..c9e45750
--- /dev/null
+++ b/www/wiki/extensions/UniversalLanguageSelector/data/LanguageNameIndexer.php
@@ -0,0 +1,160 @@
+<?php
+/**
+ * Script to create language names index.
+ *
+ * Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
+ * Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
+ * contributors. See CREDITS for a list.
+ *
+ * UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don't
+ * have to do anything special to choose one license or the other and you don't
+ * have to notify anyone which license you are using. You are free to use
+ * UniversalLanguageSelector in commercial projects as long as the copyright
+ * header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
+ *
+ * @file
+ * @ingroup Extensions
+ * @license GPL-2.0-or-later
+ * @license MIT
+ */
+
+// Resolve the MediaWiki root: use MW_INSTALL_PATH when the environment provides
+// it, otherwise fall back to the directory three levels above this script.
+$IP = getenv( 'MW_INSTALL_PATH' );
+$IP = ( $IP === false ) ? __DIR__ . '/../../..' : $IP;
+require_once $IP . '/maintenance/Maintenance.php';
+
+/**
+ * Maintenance script that builds the language name search index and writes it
+ * to LanguageNameSearchData.php next to this file.
+ *
+ * The index is a nested array: bucket index => match type ('prefix' or
+ * 'infix') => searchable display string => language code.
+ */
+class LanguageNameIndexer extends Maintenance {
+	public function __construct() {
+		parent::__construct();
+		$this->addDescription( 'Script to create language names index.' );
+	}
+
+	/**
+	 * Collect language names, group them into search buckets, print bucket
+	 * statistics and generate the data file.
+	 *
+	 * @throws RuntimeException If no index entries were produced or the data
+	 *  file cannot be written.
+	 */
+	public function execute() {
+		global $wgExtraLanguageNames;
+
+		// Avoid local configuration leaking to this script
+		$wgExtraLanguageNames = [];
+
+		$languages = Language::fetchLanguageNames( null, 'all' );
+
+		$buckets = [];
+		foreach ( $languages as $sourceLanguage => $autonym ) {
+			// NOTE(review): the meaning of the 0 and 2 arguments is defined by
+			// LanguageNames::getNames(), which is not visible here — confirm.
+			$translations = LanguageNames::getNames( $sourceLanguage, 0, 2 );
+
+			foreach ( $translations as $targetLanguage => $translation ) {
+				// Remove directionality markers used in Names.php: users are not
+				// going to type these.
+				$translation = str_replace( "\xE2\x80\x8E", '', $translation );
+				$translation = mb_strtolower( $translation );
+				$translation = trim( $translation );
+
+				// Clean up "gjermanishte zvicerane (dialekti i alpeve)" to "gjermanishte zvicerane".
+				// The original name is still shown, but avoid us creating entries such as
+				// "(dialekti" or "alpeve)".
+				$basicForm = preg_replace( '/\(.+\)$/', '', $translation );
+				// preg_split() returns false when the /u pattern meets invalid
+				// UTF-8; treat that as "no words" instead of iterating over false.
+				$words = preg_split( '/[\s]+/u', $basicForm, -1, PREG_SPLIT_NO_EMPTY ) ?: [];
+
+				foreach ( $words as $index => $word ) {
+					$bucket = LanguageNameSearch::getIndex( $word );
+
+					// The first word matches the start of the name (prefix);
+					// any later word matches inside the name (infix) and is
+					// displayed together with the full name.
+					if ( $index > 0 ) {
+						$type = 'infix';
+						$display = "$word — $translation";
+					} else {
+						$type = 'prefix';
+						$display = $translation;
+					}
+					$buckets[$bucket][$type][$display] = $targetLanguage;
+				}
+			}
+		}
+
+		// Some languages don't have a conveniently searchable name in CLDR.
+		// For example, the name of Western Punjabi doesn't start with
+		// the string "punjabi" in any language, so it cannot be found
+		// by people who search in English.
+		// To resolve this, some languages are added here locally.
+		$specialLanguages = [
+			// Catalan, sometimes searched as "Valencià"
+			'ca' => [ 'valencia' ],
+			// Spanish, the transliteration of the autonym is often used for searching
+			'es' => [ 'castellano' ],
+			// Armenian, the transliteration of the autonym is often used for searching
+			'hy' => [ 'hayeren' ],
+			// Georgian, the transliteration of the autonym is often used for searching
+			'ka' => [ 'kartuli', 'qartuli' ],
+			// Japanese, the transliteration of the autonym is often used for searching
+			'ja' => [ 'nihongo', 'にほんご' ],
+			// Western Punjabi, doesn't start with the word "Punjabi" in any language
+			'pnb' => [ 'punjabi western' ],
+			// Simplified and Traditional Chinese, because zh-hans and zh-hant
+			// are not mapped to any English name
+			'zh-hans' => [ 'chinese simplified' ],
+			'zh-hant' => [ 'chinese traditional' ],
+		];
+
+		foreach ( $specialLanguages as $targetLanguage => $translations ) {
+			foreach ( $translations as $translation ) {
+				$bucket = LanguageNameSearch::getIndex( $translation );
+				$buckets[$bucket]['prefix'][$translation] = $targetLanguage;
+			}
+		}
+
+		// The statistics below need at least one bucket (min()/max() and the
+		// average are undefined otherwise), and writing an empty index would
+		// only replace the data file with nothing useful.
+		if ( $buckets === [] ) {
+			throw new RuntimeException( 'No language names were indexed; refusing to write an empty index.' );
+		}
+
+		$lengths = [];
+		// Sorting the bucket contents gives two benefits:
+		// - more consistent output across environments
+		// - shortest matches appear first, especially exact matches
+		// Sort buckets by index
+		ksort( $buckets );
+		foreach ( $buckets as $index => &$bucketTypes ) {
+			$lengths[] = array_sum( array_map( 'count', $bucketTypes ) );
+			// Ensure 'prefix' is before 'infix';
+			krsort( $bucketTypes );
+			// Ensure each bucket has entries sorted
+			foreach ( $bucketTypes as $type => &$entries ) {
+				ksort( $entries );
+			}
+			// Drop the inner reference so it cannot alias an entry later on.
+			unset( $entries );
+		}
+		// Same for the outer reference before $buckets is used again.
+		unset( $bucketTypes );
+
+		$count = count( $buckets );
+		$min = min( $lengths );
+		$max = max( $lengths );
+		$median = self::median( $lengths );
+		$avg = array_sum( $lengths ) / $count;
+		$this->output( "Bucket stats:\n - $count buckets\n - smallest has $min entries\n" );
+		$this->output( " - largest has $max entries\n - median size is $median entries\n" );
+		$this->output( " - average size is $avg entries\n" );
+
+		$this->generateFile( $buckets );
+	}
+
+	/**
+	 * Compute the median of a non-empty list of numbers.
+	 *
+	 * The values are sorted first; for an even number of values the mean of
+	 * the two middle values is returned.
+	 *
+	 * @param int[] $values Non-empty list of numbers, in any order
+	 * @return int|float
+	 */
+	private static function median( array $values ) {
+		sort( $values );
+		$total = count( $values );
+		$middle = (int)( $total / 2 );
+
+		if ( $total % 2 === 1 ) {
+			return $values[$middle];
+		}
+
+		return ( $values[$middle - 1] + $values[$middle] ) / 2;
+	}
+
+	/**
+	 * Render the buckets as PHP source and write LanguageNameSearchData.php
+	 * into this script's directory.
+	 *
+	 * @param array $buckets Bucket index => match type => display string => language code
+	 * @throws RuntimeException If the file cannot be written.
+	 */
+	private function generateFile( array $buckets ) {
+		$template = <<<PHP
+<?php
+// This file is generated by script!
+class LanguageNameSearchData {
+ public static \$buckets = ___;
+}
+
+PHP;
+
+		// Format for short array format
+		$data = var_export( $buckets, true );
+		$data = str_replace( "array (", '[', $data );
+		$data = str_replace( "),", '],', $data );
+		// Closing of the array, add correct indentation
+		$data = preg_replace( "/\)$/", "\t]", $data );
+		// Remove newlines after =>s
+		$data = preg_replace( '/(=>)\s+(\[)/m', '\1 \2', $data );
+		// Convert spaces to tabs. Since we are not top-level need more tabs.
+		// NOTE(review): the two patterns below are identical as written, so the
+		// second substitution can never match once the first has run. Presumably
+		// they were meant to match different numbers of leading spaces (deeper
+		// level first) — confirm against the intended output before changing.
+		$data = preg_replace( '/^ /m', "\t\t\t", $data );
+		$data = preg_replace( '/^ /m', "\t\t", $data );
+
+		$template = str_replace( '___', $data, $template );
+
+		$path = __DIR__ . '/LanguageNameSearchData.php';
+		// file_put_contents() signals failure by returning false; surface it
+		// instead of silently leaving a stale or missing data file.
+		if ( file_put_contents( $path, $template ) === false ) {
+			throw new RuntimeException( "Failed to write $path" );
+		}
+	}
+}
+
+// Record this script's class name in $maintClass, then include the file named
+// by RUN_MAINTENANCE_IF_MAIN. That constant is not defined in this file;
+// presumably it comes from the Maintenance.php included above and runs the
+// class named in $maintClass — confirm.
+$maintClass = LanguageNameIndexer::class;
+require_once RUN_MAINTENANCE_IF_MAIN;