summaryrefslogtreecommitdiff
path: root/www/wiki/extensions/Translate/scripts/create-language-models.php
diff options
context:
space:
mode:
Diffstat (limited to 'www/wiki/extensions/Translate/scripts/create-language-models.php')
-rw-r--r--www/wiki/extensions/Translate/scripts/create-language-models.php234
1 files changed, 234 insertions, 0 deletions
diff --git a/www/wiki/extensions/Translate/scripts/create-language-models.php b/www/wiki/extensions/Translate/scripts/create-language-models.php
new file mode 100644
index 00000000..71a1ce72
--- /dev/null
+++ b/www/wiki/extensions/Translate/scripts/create-language-models.php
@@ -0,0 +1,234 @@
+<?php
+/**
+ * Create language models for https://github.com/crodas/LanguageDetector based
+ * on translation data in your wiki.
+ *
+ * @author Niklas Laxström
+ *
+ * @copyright Copyright © 2013, Niklas Laxström
+ * @license GPL-2.0-or-later
+ * @file
+ */
+
+// Standard boilerplate to define $IP
+if ( getenv( 'MW_INSTALL_PATH' ) !== false ) {
+ $IP = getenv( 'MW_INSTALL_PATH' );
+} else {
+ $dir = __DIR__;
+ $IP = "$dir/../../..";
+}
+require_once "$IP/maintenance/Maintenance.php";
+
+class LanguageModelCreator extends Maintenance {
+ protected $changes = [];
+
+ public function __construct() {
+ parent::__construct();
+ $this->mDescription = <<<TXT
+Create language models for https://github.com/crodas/LanguageDetector based
+on translation data in your wiki. It is safe to kill and restart the script.
+List of pages and filtered language data is cached for 24 hours. Json files
+present will be used, so don't forget to delete before new run.
+TXT;
+ }
+
+ public function execute() {
+ global $wgTranslateMessageNamespaces;
+
+ ini_set( 'memory_limit', -1 );
+
+ // How many messages per language to use.
+ // Language is skipped if it has less than 1000 translations.
+ $messages = 5000;
+
+ $languages = TranslateUtils::getLanguageNames( 'en' );
+ $cache = wfGetCache( CACHE_DB );
+ $key = wfMemcKey( __METHOD__, $messages );
+
+ $pages = $cache->get( $key );
+ if ( !is_array( $pages ) ) {
+ $dbr = wfGetDB( DB_REPLICA );
+ $conds = [];
+ $conds[] = 'page_title' . $dbr->buildLike( $dbr->anyString(), '/', $dbr->anyString() );
+ $conds['page_namespace'] = $wgTranslateMessageNamespaces;
+
+ echo "Before query\n";
+ $res = $dbr->select(
+ [ 'page' ],
+ [ 'page_title, page_id' ],
+ $conds,
+ __METHOD__
+ );
+ echo "After query\n";
+
+ $total = $res->numRows();
+ $index = 0;
+
+ foreach ( $res as $row ) {
+ $index++;
+ $code = substr( $row->page_title, strrpos( $row->page_title, '/' ) + 1 );
+ if ( isset( $languages[$code] ) ) {
+ $pages[$code][] = $row->page_id;
+ }
+
+ if ( $index % 10000 === 0 ) {
+ $progress = number_format( $index / $total * 100, 2 );
+ echo "$progress%\n";
+ }
+ }
+
+ echo "\n";
+
+ foreach ( array_keys( $pages ) as $code ) {
+ if ( count( $pages[$code] ) > $messages ) {
+ $pages[$code] = array_slice( $pages[$code], 0, $messages );
+ }
+
+ $pages[$code] = implode( '|', $pages[$code] );
+ }
+
+ echo "After code map\n";
+
+ ksort( $pages );
+
+ echo "After sort map\n";
+
+ $cache->set( $key, $pages, 3600 * 24 );
+ echo "After set map\n";
+ }
+
+ unset( $pages['qqq'] );
+ unset( $pages['de-formal'] );
+ unset( $pages['nl-informal'] );
+ unset( $pages['en-gb'] );
+
+ $pids = [];
+ $threads = 2;
+ foreach ( $pages as $code => $pageids ) {
+ $pid = ( $threads > 1 ) ? pcntl_fork() : -1;
+
+ if ( $pid === 0 ) {
+ // Child, reseed because there is no bug in PHP:
+ // https://bugs.php.net/bug.php?id=42465
+ mt_srand( getmypid() );
+ $this->analyzeLanguage( $code, $pageids );
+ exit();
+ } elseif ( $pid === -1 ) {
+ // Fork failed or one thread, do it serialized
+ $this->analyzeLanguage( $code, $pageids );
+ } else {
+ // Main thread
+ $pids[] = $pid;
+ }
+
+ // If we hit the thread limit, wait for any child to finish.
+ if ( count( $pids ) >= $threads ) {
+ $status = 0;
+ $pid = pcntl_wait( $status );
+ unset( $pids[$pid] );
+ }
+ }
+
+ foreach ( $pids as $pid ) {
+ $status = 0;
+ pcntl_waitpid( $pid, $status );
+ }
+
+ $this->output( "Combining languages\n" );
+
+ $huge = [];
+ foreach ( glob( 'temp-*.json' ) as $file ) {
+ $contents = file_get_contents( $file );
+ $json = FormatJson::decode( $contents, true );
+
+ $huge = array_merge( $json, $huge );
+ $huge['data'] = array_merge( $json['data'], $huge['data'] );
+ }
+
+ $json = FormatJson::encode( $huge, true, FormatJson::ALL_OK );
+ file_put_contents( 'translatewiki.net.json', $json );
+ }
+
+ protected function analyzeLanguage( $code, $ids ) {
+ if ( file_exists( "temp-$code.json" ) ) {
+ $this->output( "$code MODEL EXISTS\n" );
+ return;
+ }
+
+ $text = $this->cacheSourceText( $code, $ids );
+ if ( $text === '' ) {
+ return;
+ }
+
+ $config = new LanguageDetector\Config;
+ $config->useMb( true );
+ $c = new LanguageDetector\Learn( $config );
+ $c->addSample( $code, $text );
+ $c->addStepCallback( function ( $lang, $status ) {
+ echo "Learning {$lang}: $status\n";
+ } );
+
+ $target = LanguageDetector\AbstractFormat::initFormatByPath( "temp-$code.json" );
+ $c->save( $target );
+ }
+
+ protected function cacheSourceText( $code, $ids ) {
+ $cache = wfGetCache( CACHE_DB );
+ $key = wfMemcKey( __CLASS__, 'cc', $code );
+ $text = $cache->get( $key );
+ if ( !is_string( $text ) ) {
+ $snippets = [];
+
+ $ids = explode( '|', $ids );
+
+ $len = count( $ids );
+
+ if ( $len < 1000 ) {
+ $this->output( "$code: $len SKIPPED\n" );
+ return '';
+ } else {
+ $this->output( "$code PROCESSING\n" );
+ }
+
+ $time = microtime( true );
+
+ foreach ( $ids as $id ) {
+ $params = new FauxRequest( [
+ 'pageid' => $id,
+ 'action' => 'parse',
+ 'prop' => 'text',
+ 'disablepp' => 'true',
+ ] );
+
+ $api = new ApiMain( $params );
+ $api->execute();
+
+ $result = $api->getResult()->getResultData(
+ null,
+ [ 'BC' => [] ]
+ );
+
+ $text = $result['parse']['text']['*'];
+ $text = strip_tags( $text );
+ $text = str_replace( '!!FUZZY!!', '', $text );
+ $text = preg_replace( '/\$[0-9]/', '', $text );
+ $text = trim( $text );
+
+ $snippets[] = $text;
+ }
+
+ $text = implode( ' ', $snippets );
+ $cache->set( $key, $text, 3600 * 24 );
+
+ $delta = microtime( true ) - $time;
+ $this->output( "$code TOOK $delta\n" );
+ } else {
+ $this->output( "$code FROM CACHE\n" );
+ }
+
+ return $text;
+ }
+}
+
+$maintClass = LanguageModelCreator::class;
+require_once RUN_MAINTENANCE_IF_MAIN;