summaryrefslogtreecommitdiff
path: root/www/wiki/extensions/UniversalLanguageSelector/data/LanguageNameIndexer.php
blob: c9e457503b88cf3cf95f9d1970353ed7057daf87 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
<?php
/**
 * Script to create language names index.
 *
 * Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
 * Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
 * contributors. See CREDITS for a list.
 *
 * UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don't
 * have to do anything special to choose one license or the other and you don't
 * have to notify anyone which license you are using. You are free to use
 * UniversalLanguageSelector in commercial projects as long as the copyright
 * header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
 *
 * @file
 * @ingroup Extensions
 * @license GPL-2.0-or-later
 * @license MIT
 */

$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$IP = __DIR__ . '/../../..';
}
require_once "$IP/maintenance/Maintenance.php";

class LanguageNameIndexer extends Maintenance {
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Script to create language names index.' );
	}

	public function execute() {
		global $wgExtraLanguageNames;

		// Avoid local configuration leaking to this script
		$wgExtraLanguageNames = [];

		$languages = Language::fetchLanguageNames( null, 'all' );

		$buckets = [];
		foreach ( $languages as $sourceLanguage => $autonym ) {
			$translations = LanguageNames::getNames( $sourceLanguage, 0, 2 );

			foreach ( $translations as $targetLanguage => $translation ) {
				// Remove directionality markers used in Names.php: users are not
				// going to type these.
				$translation = str_replace( "\xE2\x80\x8E", '', $translation );
				$translation = mb_strtolower( $translation );
				$translation = trim( $translation );

				// Clean up "gjermanishte zvicerane (dialekti i alpeve)" to "gjermanishte zvicerane".
				// The original name is still shown, but avoid us creating entries such as
				// "(dialekti" or "alpeve)".
				$basicForm = preg_replace( '/\(.+\)$/', '', $translation );
				$words = preg_split( '/[\s]+/u', $basicForm, -1, PREG_SPLIT_NO_EMPTY );

				foreach ( $words as $index => $word ) {
					$bucket = LanguageNameSearch::getIndex( $word );

					$type = 'prefix';
					$display = $translation;
					if ( $index > 0 && count( $words ) > 1 ) {
						$type = 'infix';
						$display = "$word — $translation";
					}
					$buckets[$bucket][$type][$display] = $targetLanguage;
				}
			}
		}

		// Some languages don't have a conveniently searchable name in CLDR.
		// For example, the name of Western Punjabi doesn't start with
		// the string "punjabi" in any language, so it cannot be found
		// by people who search in English.
		// To resolve this, some languages are added here locally.
		$specialLanguages = [
			// Catalan, sometimes searched as "Valencià"
			'ca' => [ 'valencia' ],
			// Spanish, the transliteration of the autonym is often used for searching
			'es' => [ 'castellano' ],
			// Armenian, the transliteration of the autonym is often used for searching
			'hy' => [ 'hayeren' ],
			// Georgian, the transliteration of the autonym is often used for searching
			'ka' => [ 'kartuli', 'qartuli' ],
			// Japanese, the transliteration of the autonym is often used for searching
			'ja' => [ 'nihongo', 'にほんご' ],
			// Western Punjabi, doesn't start with the word "Punjabi" in any language
			'pnb' => [ 'punjabi western' ],
			// Simplified and Traditional Chinese, because zh-hans and zh-hant
			// are not mapped to any English name
			'zh-hans' => [ 'chinese simplified' ],
			'zh-hant' => [ 'chinese traditional' ],
		];

		foreach ( $specialLanguages as $targetLanguage => $translations ) {
			foreach ( $translations as $translation ) {
				$bucket = LanguageNameSearch::getIndex( $translation );
				$buckets[$bucket]['prefix'][$translation] = $targetLanguage;
			}
		}

		$lengths = [];
		// Sorting the bucket contents gives two benefits:
		// - more consistent output across environments
		// - shortest matches appear first, especially exact matches
		// Sort buckets by index
		ksort( $buckets );
		foreach ( $buckets as $index => &$bucketTypes ) {
			$lengths[] = array_sum( array_map( 'count', $bucketTypes ) );
			// Ensure 'prefix' is before 'infix';
			krsort( $bucketTypes );
			// Ensure each bucket has entries sorted
			foreach ( $bucketTypes as $type => &$bucket ) {
				ksort( $bucket );
			}
		}

		$count = count( $buckets );
		$min = min( $lengths );
		$max = max( $lengths );
		$median = $lengths[ceil( $count / 2 )];
		$avg = array_sum( $lengths ) / $count;
		$this->output( "Bucket stats:\n - $count buckets\n - smallest has $min entries\n" );
		$this->output( " - largest has $max entries\n - median size is $median entries\n" );
		$this->output( " - average size is $avg entries\n" );

		$this->generateFile( $buckets );
	}

	private function generateFile( array $buckets ) {
		$template = <<<PHP
<?php
// This file is generated by script!
class LanguageNameSearchData {
	public static \$buckets = ___;
}

PHP;

		// Format for short array format
		$data = var_export( $buckets, true );
		$data = str_replace( "array (", '[', $data );
		$data = str_replace( "),", '],', $data );
		// Closing of the array, add correct indendation
		$data = preg_replace( "/\)$/", "\t]", $data );
		// Remove newlines after =>s
		$data = preg_replace( '/(=>)\s+(\[)/m', '\1 \2', $data );
		// Convert spaces to tabs. Since we are not top-level need more tabs.
		$data = preg_replace( '/^    /m', "\t\t\t", $data );
		$data = preg_replace( '/^  /m', "\t\t", $data );

		$template = str_replace( '___', $data, $template );

		file_put_contents( __DIR__ . '/LanguageNameSearchData.php', $template );
	}
}

$maintClass = LanguageNameIndexer::class;
require_once RUN_MAINTENANCE_IF_MAIN;