summaryrefslogtreecommitdiff
path: root/www/wiki/extensions/Translate/scripts/groupStatistics.php
diff options
context:
space:
mode:
Diffstat (limited to 'www/wiki/extensions/Translate/scripts/groupStatistics.php')
-rw-r--r--www/wiki/extensions/Translate/scripts/groupStatistics.php681
1 files changed, 681 insertions, 0 deletions
diff --git a/www/wiki/extensions/Translate/scripts/groupStatistics.php b/www/wiki/extensions/Translate/scripts/groupStatistics.php
new file mode 100644
index 00000000..cc685e0f
--- /dev/null
+++ b/www/wiki/extensions/Translate/scripts/groupStatistics.php
@@ -0,0 +1,681 @@
+<?php
+/**
+ * Command line script to generate statistics about the localisation level of
+ * one or more message groups.
+ *
+ * @file
+ * @ingroup Script Stats
+ * @author Niklas Laxström
+ * @author Siebrand Mazeland
+ * @copyright Copyright © 2007-2013, Niklas Laxström, Siebrand Mazeland
+ * @license GPL-2.0-or-later
+ */
+
+// Resolve the MediaWiki installation path ($IP): the MW_INSTALL_PATH
+// environment variable wins, otherwise fall back to three directories above
+// this script. Then load the maintenance framework from that installation.
+$IP = getenv( 'MW_INSTALL_PATH' );
+if ( $IP === false ) {
+	$dir = __DIR__;
+	$IP = $dir . '/../../..';
+}
+require_once $IP . '/maintenance/Maintenance.php';
+
+class GroupStatistics extends Maintenance {
+ /**
+ * Array of the most spoken languages in the world, keyed by MediaWiki
+ * localisation code.
+ * Source: https://stats.wikimedia.org/EN/Sitemap.htm.
+ *
+ * Key value pairs of:
+ * [MediaWiki localisation code] => array(
+ * [position in top 50],
+ * [speakers in millions],
+ * [continent where localisation is spoken]
+ * )
+ *
+ * execute() prints index 0 in the position column, index 1 in the speakers
+ * column and index 2 in the continent column. The continent value 'multiple'
+ * is special-cased there and rendered with its own label.
+ *
+ * Several localisation codes share one position (for example 'zh-hans' and
+ * 'zh-hant'), so the table holds more entries than there are positions.
+ *
+ * @var array[]
+ */
+ public $mostSpokenLanguages = [
+ 'en' => [ 1, 1500, 'multiple' ],
+ 'zh-hans' => [ 2, 1300, 'asia' ],
+ 'zh-hant' => [ 2, 1300, 'asia' ],
+ 'hi' => [ 3, 550, 'asia' ],
+ 'ar' => [ 4, 530, 'multiple' ],
+ 'es' => [ 5, 500, 'multiple' ],
+ 'ms' => [ 6, 300, 'asia' ],
+ 'pt' => [ 7, 290, 'multiple' ],
+ 'pt-br' => [ 7, 290, 'america' ],
+ 'ru' => [ 8, 278, 'multiple' ],
+ 'id' => [ 9, 250, 'asia' ],
+ 'bn' => [ 10, 230, 'asia' ],
+ 'fr' => [ 11, 200, 'multiple' ],
+ 'de' => [ 12, 185, 'europe' ],
+ 'ja' => [ 13, 132, 'asia' ],
+ 'fa' => [ 14, 107, 'asia' ],
+ 'pnb' => [ 15, 104, 'asia' ], // Most spoken variant
+ 'tl' => [ 16, 90, 'asia' ],
+ 'mr' => [ 17, 90, 'asia' ],
+ 'vi' => [ 18, 80, 'asia' ],
+ 'jv' => [ 19, 80, 'asia' ],
+ 'te' => [ 20, 80, 'asia' ],
+ 'ko' => [ 21, 78, 'asia' ],
+ 'wuu' => [ 22, 77, 'asia' ],
+ 'arz' => [ 23, 76, 'africa' ],
+ 'th' => [ 24, 73, 'asia' ],
+ 'yue' => [ 25, 71, 'asia' ],
+ 'tr' => [ 26, 70, 'multiple' ],
+ 'it' => [ 27, 70, 'europe' ],
+ 'ta' => [ 28, 66, 'asia' ],
+ 'ur' => [ 29, 60, 'asia' ],
+ 'my' => [ 30, 52, 'asia' ],
+ 'sw' => [ 31, 50, 'africa' ],
+ 'nan' => [ 32, 49, 'asia' ],
+ 'kn' => [ 33, 47, 'asia' ],
+ 'gu' => [ 34, 46, 'asia' ],
+ 'uk' => [ 35, 45, 'europe' ],
+ 'pl' => [ 36, 43, 'europe' ],
+ 'sd' => [ 37, 41, 'asia' ],
+ 'ha' => [ 38, 39, 'africa' ],
+ 'ml' => [ 39, 37, 'asia' ],
+ 'gan-hans' => [ 40, 35, 'asia' ],
+ 'gan-hant' => [ 40, 35, 'asia' ],
+ 'hak' => [ 41, 34, 'asia' ],
+ 'or' => [ 42, 31, 'asia' ],
+ 'ne' => [ 43, 30, 'asia' ],
+ 'ro' => [ 44, 28, 'europe' ],
+ 'su' => [ 45, 27, 'asia' ],
+ 'az' => [ 46, 27, 'asia' ],
+ 'nl' => [ 47, 27, 'europe' ],
+ 'zu' => [ 48, 26, 'africa' ],
+ 'ps' => [ 49, 26, 'asia' ],
+ 'ckb' => [ 50, 26, 'asia' ],
+ 'ku-latn' => [ 50, 26, 'asia' ],
+ ];
+
+ /**
+ * Weighing scopes for the weighted localisation score.
+ *
+ * Variable with key-value pairs with a named index (the scope) and an array
+ * of key-value pairs where the key is a MessageGroup ID and the value is the
+ * weight of the group in the sum of the values for all the groups in the
+ * array. The weights of every scope defined here add up to 100.
+ *
+ * execute() selects a scope through the value of the "most" option and uses
+ * the 'wikimedia' scope for the "wmfscore" option.
+ *
+ * Definitions in this variable can be used to report weighted meta localisation
+ * scores for the 50 most spoken languages.
+ *
+ * @todo Allow weighted reporting for all available languages.
+ * @var array[]
+ */
+ public $localisedWeights = [
+ 'wikimedia' => [
+ // 'core-0-mostused' => 40,
+ 'core' => 50,
+ 'ext-0-wikimedia' => 50
+ ],
+ 'fundraiser' => [
+ 'ext-di-di' => 16,
+ 'ext-di-pfpg' => 84,
+ ],
+ 'mediawiki' => [
+ // 'core-0-mostused' => 30,
+ 'core' => 50,
+ 'ext-0-wikimedia' => 25,
+ 'ext-0-all' => 25
+ ]
+ ];
+
+ /**
+ * Code map to map localisation codes to Wikimedia project codes. Only
+ * exclusion and remapping is defined here. It is assumed that the first part
+ * of the localisation code is the WMF project name otherwise (zh-hans -> zh).
+ *
+ * A non-empty value is the Wikimedia project code the localisation code is
+ * reported under. An empty string excludes the localisation code from the
+ * "wmfscore" report; execute() skips such codes entirely in that mode.
+ *
+ * @var string[]
+ */
+ public $wikimediaCodeMap = [
+ // Codes containing a dash
+ 'bat-smg' => 'bat-smg',
+ 'cbk-zam' => 'cbk-zam',
+ 'map-bms' => 'map-bms',
+ 'nds-nl' => 'nds-nl',
+ 'roa-rup' => 'roa-rup',
+ 'roa-tara' => 'roa-tara',
+
+ // Remaps
+ 'be-tarask' => 'be-x-old',
+ 'gsw' => 'als',
+ 'ike-cans' => 'iu',
+ 'ike-latn' => 'iu',
+ 'lzh' => 'zh-classical',
+ 'nan' => 'zh-min-nan',
+ 'vro' => 'fiu-vro',
+ 'yue' => 'zh-yue',
+
+ // Ignored language codes. See reason.
+ 'als' => '', // gsw
+ 'be-x-old' => '', // be-tarask
+ 'crh' => '', // crh-*
+ 'de-at' => '', // de
+ 'de-ch' => '', // de
+ 'de-formal' => '', // de, not reporting formal form
+ 'dk' => '', // da
+ 'en-au' => '', // en
+ 'en-ca' => '', // no MW code
+ 'en-gb' => '', // no MW code
+ 'es-419' => '', // no MW code
+ 'fiu-vro' => '', // vro
+ 'gan' => '', // gan-*
+ 'got' => '', // extinct. not reporting formal form
+ 'hif' => '', // hif-*
+ 'hu-formal' => '', // not reporting
+ 'iu' => '', // ike-*
+ 'kk' => '', // kk-*
+ 'kk-cn' => '', // kk-arab
+ 'kk-kz' => '', // kk-cyrl
+ 'kk-tr' => '', // kk-latn
+ 'ko-kp' => '', // ko
+ 'ku' => '', // ku-*
+ 'ku-arab' => '', // ckb
+ 'nb' => '', // no
+ 'nl-be' => '', // no MW code
+ 'nl-informal' => '', // nl, not reporting informal form
+ 'ruq' => '', // ruq-*
+ 'simple' => '', // en
+ 'sr' => '', // sr-*
+ 'tg' => '', // tg-*
+ 'tp' => '', // tokipona
+ 'tt' => '', // tt-*
+ 'ug' => '', // ug-*
+ 'zh' => '', // zh-*
+ 'zh-classical' => '', // lzh
+ 'zh-cn' => '', // zh
+ 'zh-sg' => '', // zh
+ 'zh-hk' => '', // zh
+ 'zh-min-nan' => '', // nan
+ 'zh-mo' => '', // zh
+ 'zh-my' => '', // zh
+ 'zh-tw' => '', // zh
+ 'zh-yue' => '', // yue
+ ];
+
+ /**
+  * Set the script description and register every supported command line
+  * option. None of the options is required.
+  */
+ public function __construct() {
+		parent::__construct();
+		$this->mDescription = 'Script to generate statistics about the localisation ' .
+			'level of one or more message groups.';
+
+		// One entry per option, in registration order:
+		// [ option name, help text, whether the option takes a value ].
+		$optionSpecs = [
+			[
+				'groups',
+				'(optional) Comma separated list of groups',
+				true,
+			],
+			[
+				'output',
+				'(optional) csv: Comma Separated Values, wiki: MediaWiki syntax, ' .
+					'text: Text with tabs. Default: default',
+				true,
+			],
+			[
+				'skiplanguages',
+				'(optional) Comma separated list of languages to be skipped',
+				true,
+			],
+			[
+				'skipzero',
+				'(optional) Skip languages that do not have any localisation at all',
+				false,
+			],
+			[
+				'legenddetail',
+				'(optional) Page name for legend to be transcluded at the top of the details table',
+				true,
+			],
+			[
+				'legendsummary',
+				'(optional) Page name for legend to be transcluded at the top of the summary table',
+				true,
+			],
+			[
+				'fuzzy',
+				'(optional) Add column for fuzzy counts',
+				false,
+			],
+			[
+				'speakers',
+				'(optional) Add column for number of speakers (est.). ' .
+					'Only valid when combined with "most"',
+				false,
+			],
+			[
+				'nol10n',
+				'(optional) Do not add localised language name if I18ntags is installed',
+				false,
+			],
+			[
+				'continent',
+				'(optional) Add a continent column. Only available when output is ' .
+					'"wiki" or not specified.',
+				false,
+			],
+			[
+				'summary',
+				'(optional) Add a summary with counts and scores per continent category ' .
+					'and totals. Only available for a valid "most" value.',
+				true,
+			],
+			[
+				'wmfscore',
+				'Only output WMF language code and weighted score for all ' .
+					'language codes for weighing group "wikimedia" in CSV. This ' .
+					'report must keep a stable layout as it is used/will be ' .
+					'used in the Wikimedia statistics.',
+				false,
+			],
+			[
+				'most',
+				'(optional) "mediawiki" or "wikimedia". Report on the 50 most ' .
+					'spoken languages. Skipzero is ignored. If a valid scope is ' .
+					'defined, the group list and fuzzy are ignored and the ' .
+					'localisation levels are weighted and reported.',
+				true,
+			],
+		];
+
+		foreach ( $optionSpecs as $spec ) {
+			list( $optionName, $helpText, $takesValue ) = $spec;
+			// Third argument: the option is never required.
+			$this->addOption( $optionName, $helpText, false, $takesValue );
+		}
+ }
+
+ /**
+  * Collect the translation statistics of the requested message groups and
+  * print them through the selected output engine.
+  *
+  * Without "wmfscore" this prints one table row per reported language with a
+  * cell per group (plus optional position, continent, speakers, score and
+  * fuzzy cells) and, when requested, a per-continent summary table.
+  * With "wmfscore" it only echoes "code;score;" lines, one per Wikimedia
+  * project code, using the 'wikimedia' weighing scope.
+  */
+ public function execute() {
+		$output = $this->getOption( 'output', 'default' );
+
+		// Select an output engine
+		switch ( $output ) {
+			case 'wiki':
+				$out = new WikiStatsOutput();
+				break;
+			case 'text':
+				$out = new TextStatsOutput();
+				break;
+			case 'csv':
+				$out = new CsvStatsOutput();
+				break;
+			default:
+				$out = new TranslateStatsOutput();
+		}
+
+		$skipLanguages = [];
+		if ( $this->hasOption( 'skiplanguages' ) ) {
+			$skipLanguages = array_map(
+				'trim',
+				explode( ',', $this->getOption( 'skiplanguages' ) )
+			);
+		}
+
+		$most = $this->getOption( 'most' );
+		$wmfscore = $this->hasOption( 'wmfscore' );
+
+		// Pick the weighing scope. "wmfscore" always reports the 'wikimedia'
+		// scope, so it takes precedence; groups, weights and the total weight
+		// are all derived from this single scope so they cannot disagree.
+		$scoreScope = null;
+		if ( $wmfscore ) {
+			$scoreScope = 'wikimedia';
+		} elseif ( $most && isset( $this->localisedWeights[$most] ) ) {
+			$scoreScope = $most;
+		}
+		$reportScore = $scoreScope !== null;
+
+		if ( $wmfscore ) {
+			// The wmfscore report is always CSV, whatever "output" says.
+			$out = new CsvStatsOutput();
+		}
+
+		// check if l10n should be done
+		$l10n = ( $output === 'wiki' || $output === 'default' ) &&
+			!$this->hasOption( 'nol10n' );
+
+		// Optional columns. The wmfscore report has a fixed layout, so none of
+		// the table cells may be emitted in that mode. Fuzzy columns are
+		// ignored whenever a score is reported: the score expects exactly one
+		// column per group.
+		$showContinent = !$wmfscore && $this->hasOption( 'continent' );
+		$showSpeakers = !$wmfscore && $most && $this->hasOption( 'speakers' );
+		$showFuzzy = !$reportScore && $this->hasOption( 'fuzzy' );
+
+		// Get groups from input
+		if ( $reportScore ) {
+			$reqGroups = array_keys( $this->localisedWeights[$scoreScope] );
+		} else {
+			// Default to '' so a missing option never passes null to explode(),
+			// and drop empty entries such as the one produced by an empty list.
+			$reqGroups = array_filter(
+				array_map( 'trim', explode( ',', $this->getOption( 'groups', '' ) ) ),
+				'strlen'
+			);
+		}
+
+		// List of all groups
+		$allGroups = MessageGroups::singleton()->getGroups();
+
+		// Get list of valid groups, and the weight of each valid group. The
+		// weights are collected here so that they stay aligned with the groups
+		// that are actually reported when a configured group does not exist.
+		$groups = [];
+		$weights = [];
+		foreach ( $reqGroups as $reqId ) {
+			// Page translation group ids use spaces which are not nice on command line
+			$id = str_replace( '_', ' ', $reqId );
+			if ( !isset( $allGroups[$id] ) ) {
+				$this->output( "Unknown group: $id\n" );
+				continue;
+			}
+
+			$groups[$id] = $allGroups[$id];
+			if ( $reportScore ) {
+				$weights[$id] = $this->localisedWeights[$scoreScope][$reqId];
+			}
+		}
+
+		if ( !count( $groups ) ) {
+			$this->error( 'No groups given', true );
+		}
+
+		// List of all languages.
+		$languages = TranslateUtils::getLanguageNames( null );
+		// Default sorting order by language code, users can sort wiki output.
+		ksort( $languages );
+
+		if ( $this->hasOption( 'legenddetail' ) ) {
+			$out->addFreeText( '{{' . $this->getOption( 'legenddetail' ) . "}}\n" );
+		}
+
+		$totalWeight = $reportScore ? array_sum( $this->localisedWeights[$scoreScope] ) : 0;
+
+		if ( !$wmfscore ) {
+			// Output headers
+			$out->heading();
+
+			$out->blockstart();
+
+			if ( $most ) {
+				$out->element( ( $l10n ? '{{int:translate-gs-pos}}' : 'Pos.' ), true );
+			}
+
+			$out->element( ( $l10n ? '{{int:translate-gs-code}}' : 'Code' ), true );
+			$out->element( ( $l10n ? '{{int:translate-page-language}}' : 'Language' ), true );
+			if ( $showContinent ) {
+				$out->element( ( $l10n ? '{{int:translate-gs-continent}}' : 'Continent' ), true );
+			}
+
+			if ( $showSpeakers ) {
+				$out->element( ( $l10n ? '{{int:translate-gs-speakers}}' : 'Speakers' ), true );
+			}
+
+			if ( $reportScore ) {
+				$out->element(
+					( $l10n ? '{{int:translate-gs-score}}' : 'Score' ) . ' (' . $totalWeight . ')',
+					true
+				);
+			}
+
+			/**
+			 * @var $g MessageGroup
+			 */
+			foreach ( $groups as $gid => $g ) {
+				// Add unprocessed description of group as heading
+				$heading = $g->getLabel();
+				if ( $reportScore ) {
+					$heading .= ' (' . $weights[$gid] . ')';
+				}
+				$out->element( $heading, true );
+				if ( $showFuzzy ) {
+					$out->element( ( $l10n ? '{{int:translate-percentage-fuzzy}}' : 'Fuzzy' ), true );
+				}
+			}
+
+			$out->blockend();
+		}
+
+		// Decide once which languages are reported; the statistics and the
+		// output loops below both work on this list.
+		$reportedLanguages = [];
+		foreach ( $languages as $code => $name ) {
+			// Skip list (not applied when --most is set)
+			if ( !$most && in_array( $code, $skipLanguages ) ) {
+				continue;
+			}
+
+			// Codes mapped to '' are not reported in the wmfscore report
+			if ( $wmfscore && isset( $this->wikimediaCodeMap[$code] )
+				&& $this->wikimediaCodeMap[$code] === ''
+			) {
+				continue;
+			}
+
+			// If --most is set, skip all other
+			if ( $most && !isset( $this->mostSpokenLanguages[$code] ) ) {
+				continue;
+			}
+
+			$reportedLanguages[$code] = $name;
+		}
+
+		// Per language a list of columns [ is fuzzy column, count, total ],
+		// in group order.
+		$rows = [];
+		foreach ( $reportedLanguages as $code => $name ) {
+			$rows[$code] = [];
+		}
+
+		foreach ( array_keys( $groups ) as $groupName ) {
+			$stats = MessageGroupStats::forGroup( $groupName );
+
+			// Perform the statistic calculations on every language
+			foreach ( $reportedLanguages as $code => $name ) {
+				$total = $stats[$code][MessageGroupStats::TOTAL];
+				$translated = $stats[$code][MessageGroupStats::TRANSLATED];
+				$fuzzy = $stats[$code][MessageGroupStats::FUZZY];
+
+				$rows[$code][] = [ false, $translated, $total ];
+
+				if ( $showFuzzy ) {
+					$rows[$code][] = [ true, $fuzzy, $total ];
+				}
+			}
+		}
+
+		// Summary per continent: [ number of languages, sum of rounded scores ]
+		$summarise = $this->hasOption( 'summary' );
+		$summary = [];
+
+		// Per Wikimedia project code: [ 'sum' => sum of scores, 'count' => variants ]
+		$wmfscores = [];
+
+		// Weights by column index; score rows have exactly one column per group.
+		$columnWeights = array_values( $weights );
+		$skipZero = $this->hasOption( 'skipzero' );
+
+		foreach ( $reportedLanguages as $code => $name ) {
+			$columns = $rows[$code];
+
+			$allZero = true;
+			foreach ( $columns as $fields ) {
+				if ( (int)$fields[1] !== 0 ) {
+					$allZero = false;
+					break;
+				}
+			}
+
+			// Skip dummy languages if requested
+			if ( $allZero && $skipZero ) {
+				continue;
+			}
+
+			// Position, speakers and continent data; only known for the most
+			// spoken languages.
+			$langInfo = isset( $this->mostSpokenLanguages[$code] ) ?
+				$this->mostSpokenLanguages[$code] :
+				null;
+
+			// Output the row
+			if ( !$wmfscore ) {
+				$out->blockstart();
+
+				// Fill language position field
+				if ( $most ) {
+					$out->element( $langInfo[0] );
+				}
+
+				// Fill language code field
+				$out->element( $code );
+
+				// Fill language name field
+				if ( $l10n && function_exists( 'efI18nTagsInit' ) ) {
+					$out->element( '{{#languagename:' . $code . '}}' );
+				} else {
+					$out->element( $name );
+				}
+
+				// Fill continent field
+				if ( $showContinent ) {
+					if ( $langInfo === null ) {
+						// No continent data for this language: emit an empty
+						// cell so the row stays aligned with the header.
+						$continent = '';
+					} elseif ( $langInfo[2] === 'multiple' ) {
+						$continent = ( $l10n ? '{{int:translate-gs-multiple}}' : 'Multiple' );
+					} else {
+						$continent = $l10n ?
+							'{{int:timezoneregion-' . $langInfo[2] . '}}' :
+							ucfirst( $langInfo[2] );
+					}
+
+					$out->element( $continent );
+				}
+
+				// Fill speakers field
+				if ( $showSpeakers ) {
+					$out->element( number_format( $langInfo[1] ) );
+				}
+			}
+
+			// Fill the score field
+			if ( $reportScore ) {
+				// Start with 0 points
+				$score = 0;
+
+				foreach ( $columns as $i => $fields ) {
+					list( , $upper, $total ) = $fields;
+					// Weigh the score and add it to the current score. A group
+					// without any messages contributes nothing (and must not
+					// cause a division by zero).
+					if ( $total > 0 ) {
+						$score += ( $columnWeights[$i] * $upper ) / $total;
+					}
+				}
+
+				// Report round numbers
+				$displayScore = number_format( $score, 0 );
+				$roundedScore = (int)round( $score );
+
+				// Continent data only exists for the most spoken languages
+				if ( $summarise && $langInfo !== null ) {
+					$region = $langInfo[2];
+					if ( !isset( $summary[$region] ) ) {
+						$summary[$region] = [ 0, 0 ];
+					}
+					$summary[$region][0]++;
+					$summary[$region][1] += $roundedScore;
+				}
+
+				if ( $wmfscore ) {
+					// Multiple variants can be used for the same wiki.
+					// Collect sum and count of the scores and output the
+					// average later, so no precision is lost on the way.
+					if ( isset( $this->wikimediaCodeMap[$code] ) ) {
+						$wmfcode = $this->wikimediaCodeMap[$code];
+					} else {
+						$codeparts = explode( '-', $code );
+						$wmfcode = $codeparts[0];
+					}
+
+					if ( !isset( $wmfscores[$wmfcode] ) ) {
+						$wmfscores[$wmfcode] = [ 'sum' => 0, 'count' => 0 ];
+					}
+					$wmfscores[$wmfcode]['sum'] += $roundedScore;
+					$wmfscores[$wmfcode]['count']++;
+				} else {
+					$out->element( $displayScore );
+				}
+			}
+
+			// Fill fields for groups
+			if ( !$wmfscore ) {
+				foreach ( $columns as $fields ) {
+					list( $invert, $upper, $total ) = $fields;
+					$out->element( $out->formatPercent( $upper, $total, $invert ) );
+				}
+
+				$out->blockend();
+			}
+		}
+
+		$out->footer();
+
+		if ( $reportScore && $summarise ) {
+			if ( $this->hasOption( 'legendsummary' ) ) {
+				$out->addFreeText( '{{' . $this->getOption( 'legendsummary' ) . "}}\n" );
+			}
+
+			$out->summaryheading();
+
+			$out->blockstart();
+
+			$out->element( $l10n ? '{{int:translate-gs-continent}}' : 'Continent', true );
+			$out->element( $l10n ? '{{int:translate-gs-count}}' : 'Count', true );
+			$out->element( $l10n ? '{{int:translate-gs-avgscore}}' : 'Avg. score', true );
+
+			$out->blockend();
+
+			ksort( $summary );
+
+			$totals = [ 0, 0 ];
+
+			foreach ( $summary as $key => $values ) {
+				$out->blockstart();
+
+				if ( $key === 'multiple' ) {
+					$out->element( $l10n ? '{{int:translate-gs-multiple}}' : 'Multiple' );
+				} else {
+					$out->element( $l10n ? '{{int:timezoneregion-' . $key . '}}' : ucfirst( $key ) );
+				}
+				$out->element( $values[0] );
+				// Every summary entry counts at least one language
+				$out->element( number_format( $values[1] / $values[0] ) );
+
+				$out->blockend();
+
+				$totals[0] += $values[0];
+				$totals[1] += $values[1];
+			}
+
+			// The summary can be empty, e.g. when every language was skipped
+			$average = $totals[0] > 0 ? $totals[1] / $totals[0] : 0;
+
+			$out->blockstart();
+			$out->element( $l10n ? '{{int:translate-gs-total}}' : 'Total' );
+			$out->element( $totals[0] );
+			$out->element( number_format( $average ) );
+			$out->blockend();
+
+			$out->footer();
+		}
+
+		// Custom output
+		if ( $wmfscore ) {
+			ksort( $wmfscores );
+
+			foreach ( $wmfscores as $code => $stats ) {
+				echo $code . ';' . number_format( $stats['sum'] / $stats['count'] ) . ";\n";
+			}
+		}
+ }
+}
+
+// Name this class as the maintenance script to run. RUN_MAINTENANCE_IF_MAIN
+// is defined outside this file (presumably by Maintenance.php - confirm) and
+// is expected to run the script when this file is invoked directly.
+$maintClass = GroupStatistics::class;
+require_once RUN_MAINTENANCE_IF_MAIN;