diff options
Diffstat (limited to 'www/wiki/extensions/Translate/scripts/groupStatistics.php')
-rw-r--r-- | www/wiki/extensions/Translate/scripts/groupStatistics.php | 681 |
1 files changed, 681 insertions, 0 deletions
<?php
/**
 * Command line script to generate statistics about the localisation level of
 * one or more message groups.
 *
 * @file
 * @ingroup Script Stats
 * @author Niklas Laxström
 * @author Siebrand Mazeland
 * @copyright Copyright © 2007-2013, Niklas Laxström, Siebrand Mazeland
 * @license GPL-2.0-or-later
 */

// Standard boilerplate to define $IP
if ( getenv( 'MW_INSTALL_PATH' ) !== false ) {
	$IP = getenv( 'MW_INSTALL_PATH' );
} else {
	$dir = __DIR__;
	$IP = "$dir/../../..";
}
require_once "$IP/maintenance/Maintenance.php";

/**
 * Maintenance script that prints, per language, how far one or more message
 * groups have been translated. It can optionally weigh the groups into a
 * single score, restrict the report to the most spoken languages, add a
 * per-continent summary, or emit a compact "WMF score" CSV.
 */
class GroupStatistics extends Maintenance {
	/**
	 * Array of the most spoken languages in the world.
	 * Source: https://stats.wikimedia.org/EN/Sitemap.htm.
	 *
	 * Key value pairs of:
	 * [MediaWiki localisation code] => array(
	 *    [position in top 50],
	 *    [speakers in millions],
	 *    [continent where localisation is spoken]
	 * )
	 */
	public $mostSpokenLanguages = [
		'en' => [ 1, 1500, 'multiple' ],
		'zh-hans' => [ 2, 1300, 'asia' ],
		'zh-hant' => [ 2, 1300, 'asia' ],
		'hi' => [ 3, 550, 'asia' ],
		'ar' => [ 4, 530, 'multiple' ],
		'es' => [ 5, 500, 'multiple' ],
		'ms' => [ 6, 300, 'asia' ],
		'pt' => [ 7, 290, 'multiple' ],
		'pt-br' => [ 7, 290, 'america' ],
		'ru' => [ 8, 278, 'multiple' ],
		'id' => [ 9, 250, 'asia' ],
		'bn' => [ 10, 230, 'asia' ],
		'fr' => [ 11, 200, 'multiple' ],
		'de' => [ 12, 185, 'europe' ],
		'ja' => [ 13, 132, 'asia' ],
		'fa' => [ 14, 107, 'asia' ],
		'pnb' => [ 15, 104, 'asia' ], // Most spoken variant
		'tl' => [ 16, 90, 'asia' ],
		'mr' => [ 17, 90, 'asia' ],
		'vi' => [ 18, 80, 'asia' ],
		'jv' => [ 19, 80, 'asia' ],
		'te' => [ 20, 80, 'asia' ],
		'ko' => [ 21, 78, 'asia' ],
		'wuu' => [ 22, 77, 'asia' ],
		'arz' => [ 23, 76, 'africa' ],
		'th' => [ 24, 73, 'asia' ],
		'yue' => [ 25, 71, 'asia' ],
		'tr' => [ 26, 70, 'multiple' ],
		'it' => [ 27, 70, 'europe' ],
		'ta' => [ 28, 66, 'asia' ],
		'ur' => [ 29, 60, 'asia' ],
		'my' => [ 30, 52, 'asia' ],
		'sw' => [ 31, 50, 'africa' ],
		'nan' => [ 32, 49, 'asia' ],
		'kn' => [ 33, 47, 'asia' ],
		'gu' => [ 34, 46, 'asia' ],
		'uk' => [ 35, 45, 'europe' ],
		'pl' => [ 36, 43, 'europe' ],
		'sd' => [ 37, 41, 'asia' ],
		'ha' => [ 38, 39, 'africa' ],
		'ml' => [ 39, 37, 'asia' ],
		'gan-hans' => [ 40, 35, 'asia' ],
		'gan-hant' => [ 40, 35, 'asia' ],
		'hak' => [ 41, 34, 'asia' ],
		'or' => [ 42, 31, 'asia' ],
		'ne' => [ 43, 30, 'asia' ],
		'ro' => [ 44, 28, 'europe' ],
		'su' => [ 45, 27, 'asia' ],
		'az' => [ 46, 27, 'asia' ],
		'nl' => [ 47, 27, 'europe' ],
		'zu' => [ 48, 26, 'africa' ],
		'ps' => [ 49, 26, 'asia' ],
		'ckb' => [ 50, 26, 'asia' ],
		'ku-latn' => [ 50, 26, 'asia' ],
	];

	/**
	 * Variable with key-value pairs with a named index and an array of key-value
	 * pairs where the key is a MessageGroup ID and the value is a weight of the
	 * group in the sum of the values for all the groups in the array.
	 *
	 * Definitions in this variable can be used to report weighted meta localisation
	 * scores for the 50 most spoken languages.
	 *
	 * @todo Allow weighted reporting for all available languages.
	 */
	public $localisedWeights = [
		'wikimedia' => [
			// 'core-0-mostused' => 40,
			'core' => 50,
			'ext-0-wikimedia' => 50
		],
		'fundraiser' => [
			'ext-di-di' => 16,
			'ext-di-pfpg' => 84,
		],
		'mediawiki' => [
			// 'core-0-mostused' => 30,
			'core' => 50,
			'ext-0-wikimedia' => 25,
			'ext-0-all' => 25
		]
	];

	/**
	 * Code map to map localisation codes to Wikimedia project codes. Only
	 * exclusion and remapping is defined here. It is assumed that the first part
	 * of the localisation code is the WMF project name otherwise (zh-hans -> zh).
	 */
	public $wikimediaCodeMap = [
		// Codes containing a dash
		'bat-smg' => 'bat-smg',
		'cbk-zam' => 'cbk-zam',
		'map-bms' => 'map-bms',
		'nds-nl' => 'nds-nl',
		'roa-rup' => 'roa-rup',
		'roa-tara' => 'roa-tara',

		// Remaps
		'be-tarask' => 'be-x-old',
		'gsw' => 'als',
		'ike-cans' => 'iu',
		'ike-latn' => 'iu',
		'lzh' => 'zh-classical',
		'nan' => 'zh-min-nan',
		'vro' => 'fiu-vro',
		'yue' => 'zh-yue',

		// Ignored language codes. See reason.
		'als' => '', // gsw
		'be-x-old' => '', // be-tarask
		'crh' => '', // crh-*
		'de-at' => '', // de
		'de-ch' => '', // de
		'de-formal' => '', // de, not reporting formal form
		'dk' => '', // da
		'en-au' => '', // en
		'en-ca' => '', // no MW code
		'en-gb' => '', // no MW code
		'es-419' => '', // no MW code
		'fiu-vro' => '', // vro
		'gan' => '', // gan-*
		'got' => '', // extinct. not reporting formal form
		'hif' => '', // hif-*
		'hu-formal' => '', // not reporting
		'iu' => '', // ike-*
		'kk' => '', // kk-*
		'kk-cn' => '', // kk-arab
		'kk-kz' => '', // kk-cyrl
		'kk-tr' => '', // kk-latn
		'ko-kp' => '', // ko
		'ku' => '', // ku-*
		'ku-arab' => '', // ckb
		'nb' => '', // no
		'nl-be' => '', // no MW code
		'nl-informal' => '', // nl, not reporting informal form
		'ruq' => '', // ruq-*
		'simple' => '', // en
		'sr' => '', // sr-*
		'tg' => '', // tg-*
		'tp' => '', // tokipona
		'tt' => '', // tt-*
		'ug' => '', // ug-*
		'zh' => '', // zh-*
		'zh-classical' => '', // lzh
		'zh-cn' => '', // zh
		'zh-sg' => '', // zh
		'zh-hk' => '', // zh
		'zh-min-nan' => '', // nan
		'zh-mo' => '', // zh
		'zh-my' => '', // zh
		'zh-tw' => '', // zh
		'zh-yue' => '', // yue
	];

	/**
	 * Registers the script description and all command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Script to generate statistics about the localisation ' .
			'level of one or more message groups.';
		$this->addOption(
			'groups',
			'(optional) Comma separated list of groups',
			false, /*required*/
			true /*has arg*/
		);
		$this->addOption(
			'output',
			'(optional) csv: Comma Separated Values, wiki: MediaWiki syntax, ' .
				'text: Text with tabs. Default: default',
			false, /*required*/
			true /*has arg*/
		);
		$this->addOption(
			'skiplanguages',
			'(optional) Comma separated list of languages to be skipped',
			false, /*required*/
			true /*has arg*/
		);
		$this->addOption(
			'skipzero',
			'(optional) Skip languages that do not have any localisation at all'
		);
		$this->addOption(
			'legenddetail',
			'(optional) Page name for legend to be transcluded at the top of the details table',
			false, /*required*/
			true /*has arg*/
		);
		$this->addOption(
			'legendsummary',
			'(optional) Page name for legend to be transcluded at the top of the summary table',
			false, /*required*/
			true /*has arg*/
		);
		$this->addOption(
			'fuzzy',
			'(optional) Add column for fuzzy counts'
		);
		$this->addOption(
			'speakers',
			'(optional) Add column for number of speakers (est.). ' .
				'Only valid when combined with "most"'
		);
		$this->addOption(
			'nol10n',
			'(optional) Do not add localised language name if I18ntags is installed'
		);
		$this->addOption(
			'continent',
			'(optional) Add a continent column. Only available when output is ' .
				'"wiki" or not specified.'
		);
		$this->addOption(
			'summary',
			'(optional) Add a summary with counts and scores per continent category ' .
				'and totals. Only available for a valid "most" value.',
			false, /*required*/
			true /*has arg*/
		);
		$this->addOption(
			'wmfscore',
			'Only output WMF language code and weighted score for all ' .
				'language codes for weighing group "wikimedia" in CSV. This ' .
				'report must keep a stable layout as it is used/will be ' .
				'used in the Wikimedia statistics.'
		);
		$this->addOption(
			'most',
			'(optional) "mediawiki" or "wikimedia". Report on the 50 most ' .
				'spoken languages. Skipzero is ignored. If a valid scope is ' .
				'defined, the group list and fuzzy are ignored and the ' .
				'localisation levels are weighted and reported.',
			false, /*required*/
			true /*has arg*/
		);
	}

	/**
	 * Collects the per-language statistics of the requested groups and prints
	 * them through the selected output engine.
	 *
	 * Modes, as selected by the options:
	 *  - plain: one translated column per group given in --groups, plus an
	 *    optional fuzzy column per group;
	 *  - --most=<scope>: only languages listed in $mostSpokenLanguages, groups
	 *    and weights taken from $localisedWeights[<scope>], plus a score column;
	 *  - --wmfscore: no table at all, only "<wmf code>;<score>;" lines using
	 *    the "wikimedia" weights. This takes precedence over the scope given
	 *    in --most for the choice of groups and weights.
	 */
	public function execute() {
		$output = $this->getOption( 'output', 'default' );
		$wmfscore = $this->hasOption( 'wmfscore' );

		// Select an output engine
		switch ( $output ) {
			case 'wiki':
				$out = new WikiStatsOutput();
				break;
			case 'text':
				$out = new TextStatsOutput();
				break;
			case 'csv':
				$out = new CsvStatsOutput();
				break;
			default:
				$out = new TranslateStatsOutput();
		}
		if ( $wmfscore ) {
			// The WMF score report is always CSV, whatever --output says.
			$out = new CsvStatsOutput();
		}

		$skipLanguages = [];
		if ( $this->hasOption( 'skiplanguages' ) ) {
			$skipLanguages = array_map(
				'trim',
				explode( ',', $this->getOption( 'skiplanguages' ) )
			);
		}

		// Decide which weighing scope (if any) applies. Groups, weights and the
		// total weight are all derived from this single value so that they can
		// never disagree with each other.
		$most = $this->getOption( 'most' );
		$scoreScope = null;
		if ( $wmfscore ) {
			$scoreScope = 'wikimedia';
		} elseif ( $most && isset( $this->localisedWeights[$most] ) ) {
			$scoreScope = $most;
		}
		$reportScore = $scoreScope !== null;

		// check if l10n should be done
		$l10n = ( $output === 'wiki' || $output === 'default' ) &&
			!$this->hasOption( 'nol10n' );

		// Fuzzy columns are documented as ignored when a score is reported;
		// adding them would also break the 1:1 mapping of columns to weights.
		$showFuzzy = !$reportScore && $this->hasOption( 'fuzzy' );
		$showContinent = $this->hasOption( 'continent' );
		$showSpeakers = $most && $this->hasOption( 'speakers' );
		$summarise = $reportScore && $this->hasOption( 'summary' );

		// Get groups from input
		if ( $reportScore ) {
			$reqGroups = array_keys( $this->localisedWeights[$scoreScope] );
		} else {
			// Default to '' so a missing --groups does not pass null to explode(),
			// and drop empty entries so it is reported as "No groups given".
			$reqGroups = array_filter(
				array_map( 'trim', explode( ',', $this->getOption( 'groups', '' ) ) ),
				'strlen'
			);
		}

		// List of all groups
		$allGroups = MessageGroups::singleton()->getGroups();

		// Get list of valid groups, and the weight of each valid group. Weights
		// are only collected for groups that exist so that the n-th weight
		// always belongs to the n-th reported group.
		$groups = [];
		$weights = [];
		foreach ( $reqGroups as $reqId ) {
			// Page translation group ids use spaces which are not nice on command line
			$id = str_replace( '_', ' ', $reqId );
			if ( isset( $allGroups[$id] ) ) {
				$groups[$id] = $allGroups[$id];
				if ( $reportScore ) {
					$weights[$id] = $this->localisedWeights[$scoreScope][$reqId];
				}
			} else {
				$this->output( "Unknown group: $id\n" );
			}
		}
		$weightList = array_values( $weights );

		if ( !count( $groups ) ) {
			$this->error( 'No groups given', true );
		}

		// List of all languages.
		$languages = TranslateUtils::getLanguageNames( null );
		// Default sorting order by language code, users can sort wiki output.
		ksort( $languages );

		if ( $this->hasOption( 'legenddetail' ) ) {
			$out->addFreeText( '{{' . $this->getOption( 'legenddetail' ) . "}}\n" );
		}

		// Maximum achievable score: the sum of all weights configured for the scope.
		$totalWeight = $reportScore ? array_sum( $this->localisedWeights[$scoreScope] ) : 0;

		if ( !$wmfscore ) {
			// Output headers
			$out->heading();

			$out->blockstart();

			if ( $most ) {
				$out->element( ( $l10n ? '{{int:translate-gs-pos}}' : 'Pos.' ), true );
			}

			$out->element( ( $l10n ? '{{int:translate-gs-code}}' : 'Code' ), true );
			$out->element( ( $l10n ? '{{int:translate-page-language}}' : 'Language' ), true );
			if ( $showContinent ) {
				$out->element( ( $l10n ? '{{int:translate-gs-continent}}' : 'Continent' ), true );
			}

			if ( $showSpeakers ) {
				$out->element( ( $l10n ? '{{int:translate-gs-speakers}}' : 'Speakers' ), true );
			}

			if ( $reportScore ) {
				$out->element(
					( $l10n ? '{{int:translate-gs-score}}' : 'Score' ) . ' (' . $totalWeight . ')',
					true
				);
			}

			/**
			 * @var $g MessageGroup
			 */
			foreach ( $groups as $groupId => $g ) {
				// Add unprocessed description of group as heading
				if ( $reportScore ) {
					$heading = $g->getLabel() . ' (' . $weights[$groupId] . ')';
				} else {
					$heading = $g->getLabel();
				}
				$out->element( $heading, true );
				if ( $showFuzzy ) {
					$out->element( ( $l10n ? '{{int:translate-percentage-fuzzy}}' : 'Fuzzy' ), true );
				}
			}

			$out->blockend();
		}

		// One (initially empty) row per language that will be reported.
		$rows = [];
		foreach ( $languages as $code => $name ) {
			if ( !$this->isLanguageSkipped( $code, $most, $wmfscore, $skipLanguages ) ) {
				$rows[$code] = [];
			}
		}

		// Perform the statistic calculations on every reported language
		foreach ( $groups as $groupName => $g ) {
			$stats = MessageGroupStats::forGroup( $groupName );

			foreach ( array_keys( $rows ) as $code ) {
				$total = $stats[$code][MessageGroupStats::TOTAL];
				$translated = $stats[$code][MessageGroupStats::TRANSLATED];
				$fuzzy = $stats[$code][MessageGroupStats::FUZZY];

				// Each column is [ invert percentage?, count, total ].
				$rows[$code][] = [ false, $translated, $total ];

				if ( $showFuzzy ) {
					$rows[$code][] = [ true, $fuzzy, $total ];
				}
			}
		}

		// Per continent: [ number of languages, sum of scores ]
		$summary = [];
		// Per WMF project code: [ 'sum' => sum of scores, 'count' => number of variants ]
		$wmfscores = [];

		foreach ( $rows as $code => $columns ) {
			$name = $languages[$code];

			$allZero = true;
			foreach ( $columns as $fields ) {
				if ( (int)$fields[1] !== 0 ) {
					$allZero = false;
					break;
				}
			}

			// Skip dummy languages if requested
			if ( $allZero && $this->hasOption( 'skipzero' ) ) {
				continue;
			}

			$score = 0;
			if ( $reportScore ) {
				$score = $this->calculateScore( $columns, $weightList );

				// Continent data only exists for the most spoken languages.
				if ( $summarise && isset( $this->mostSpokenLanguages[$code] ) ) {
					$continent = $this->mostSpokenLanguages[$code][2];
					if ( !isset( $summary[$continent] ) ) {
						$summary[$continent] = [ 0, 0 ];
					}
					$summary[$continent][0]++;
					$summary[$continent][1] += $score;
				}
			}

			if ( $wmfscore ) {
				// Multiple variants can be used for the same wiki.
				// Store the scores in an array and output them later
				// when they can be averaged.
				if ( isset( $this->wikimediaCodeMap[$code] ) ) {
					$wmfcode = $this->wikimediaCodeMap[$code];
				} else {
					$codeparts = explode( '-', $code );
					$wmfcode = $codeparts[0];
				}

				if ( !isset( $wmfscores[$wmfcode] ) ) {
					$wmfscores[$wmfcode] = [ 'sum' => 0, 'count' => 0 ];
				}
				$wmfscores[$wmfcode]['sum'] += $score;
				$wmfscores[$wmfcode]['count']++;

				// Nothing else may be printed in this mode: the report layout
				// must stay stable.
				continue;
			}

			// Output the row
			$out->blockstart();

			// Fill language position field
			if ( $most ) {
				$out->element( $this->mostSpokenLanguages[$code][0] );
			}

			// Fill language code field
			$out->element( $code );

			// Fill language name field
			if ( $l10n && function_exists( 'efI18nTagsInit' ) ) {
				$out->element( '{{#languagename:' . $code . '}}' );
			} else {
				$out->element( $name );
			}

			// Fill continent field; empty when the language has no continent data
			if ( $showContinent ) {
				if ( isset( $this->mostSpokenLanguages[$code] ) ) {
					$out->element(
						$this->formatContinent( $this->mostSpokenLanguages[$code][2], $l10n )
					);
				} else {
					$out->element( '' );
				}
			}

			// Fill speakers field
			if ( $showSpeakers ) {
				$out->element( number_format( $this->mostSpokenLanguages[$code][1] ) );
			}

			// Fill the score field
			if ( $reportScore ) {
				$out->element( $score );
			}

			// Fill fields for groups
			foreach ( $columns as $fields ) {
				list( $invert, $upper, $total ) = $fields;
				$out->element( $out->formatPercent( $upper, $total, $invert ) );
			}

			$out->blockend();
		}

		$out->footer();

		if ( $summarise ) {
			if ( $this->hasOption( 'legendsummary' ) ) {
				$out->addFreeText( '{{' . $this->getOption( 'legendsummary' ) . "}}\n" );
			}

			$out->summaryheading();

			$out->blockstart();

			$out->element( $l10n ? '{{int:translate-gs-continent}}' : 'Continent', true );
			$out->element( $l10n ? '{{int:translate-gs-count}}' : 'Count', true );
			$out->element( $l10n ? '{{int:translate-gs-avgscore}}' : 'Avg. score', true );

			$out->blockend();

			ksort( $summary );

			$totals = [ 0, 0 ];

			foreach ( $summary as $key => $values ) {
				$out->blockstart();

				$out->element( $this->formatContinent( $key, $l10n ) );
				$out->element( $values[0] );
				// $values[0] is at least 1 for every continent present in $summary.
				$out->element( number_format( $values[1] / $values[0] ) );

				$out->blockend();

				$totals[0] += $values[0];
				$totals[1] += $values[1];
			}

			$out->blockstart();
			$out->element( $l10n ? '{{int:translate-gs-total}}' : 'Total' );
			$out->element( $totals[0] );
			// No language may have been summarised at all; avoid dividing by zero.
			$out->element( number_format( $totals[0] > 0 ? $totals[1] / $totals[0] : 0 ) );
			$out->blockend();

			$out->footer();
		}

		// Custom output
		if ( $wmfscore ) {
			ksort( $wmfscores );

			foreach ( $wmfscores as $code => $stats ) {
				// Average over all variants mapped to the same WMF project code.
				echo $code . ';' . number_format( $stats['sum'] / $stats['count'] ) . ";\n";
			}
		}
	}

	/**
	 * Tells whether a language must be left out of the report.
	 *
	 * @param string $code MediaWiki language code
	 * @param string|null $most Value of the "most" option, if any
	 * @param bool $wmfscore Whether the WMF score report was requested
	 * @param string[] $skipLanguages Codes given in the "skiplanguages" option
	 * @return bool
	 */
	private function isLanguageSkipped( $code, $most, $wmfscore, array $skipLanguages ) {
		// The skip list only applies when not reporting on the most spoken languages
		if ( !$most && in_array( $code, $skipLanguages ) ) {
			return true;
		}

		// Codes mapped to an empty string are ignored for the WMF score
		if ( $wmfscore && isset( $this->wikimediaCodeMap[$code] )
			&& $this->wikimediaCodeMap[$code] === ''
		) {
			return true;
		}

		// If --most is set, skip all other
		if ( $most && !isset( $this->mostSpokenLanguages[$code] ) ) {
			return true;
		}

		return false;
	}

	/**
	 * Computes the weighted localisation score of one language.
	 *
	 * @param array[] $columns List of [ invert, count, total ], one per group
	 * @param int[] $weights List of weights, in the same order as $columns
	 * @return int Score rounded to the nearest integer
	 */
	private function calculateScore( array $columns, array $weights ) {
		$score = 0;

		foreach ( array_values( $columns ) as $i => $fields ) {
			list( , $upper, $total ) = $fields;
			// A group without messages contributes nothing and must not be
			// divided by; a column without a weight cannot be weighed.
			if ( $total > 0 && isset( $weights[$i] ) ) {
				// Weigh the score and add it to the current score
				$score += ( $weights[$i] * $upper ) / $total;
			}
		}

		// Report a round number
		return (int)round( $score );
	}

	/**
	 * Returns the cell text for a continent identifier.
	 *
	 * @param string $continent Continent identifier as used in $mostSpokenLanguages
	 * @param bool $l10n Whether to return a localisable {{int:}} wikitext message
	 * @return string
	 */
	private function formatContinent( $continent, $l10n ) {
		if ( $continent === 'multiple' ) {
			return $l10n ? '{{int:translate-gs-multiple}}' : 'Multiple';
		}

		return $l10n ? '{{int:timezoneregion-' . $continent . '}}' : ucfirst( $continent );
	}
}

$maintClass = GroupStatistics::class;
require_once RUN_MAINTENANCE_IF_MAIN;