<?php
/**
 * Creates language detection models from translatewiki.net translation pages
 * using the crodas/LanguageDetector library.
 */

// Standard maintenance boilerplate; the relative path is an assumption,
// adjust it to where this script lives in the wiki installation.
$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$IP = __DIR__ . '/../../..';
}
require_once "$IP/maintenance/Maintenance.php";

class LanguageModelCreator extends Maintenance {
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Creates language detection models from translation pages.';
	}

	public function execute() {
		global $wgTranslateMessageNamespaces;

		// Known language codes; pages with other subpage suffixes are ignored.
		// Assumption: the original built this map from MediaWiki's language names.
		$languages = Language::fetchLanguageNames();

		// Maximum number of messages fed to the learner per language (assumed value).
		$messages = 5000;

		$cache = wfGetCache( CACHE_DB );
		$key = wfMemcKey( __CLASS__, 'pages' );
		$pages = $cache->get( $key );
		if ( !is_array( $pages ) ) {
			$pages = [];

			// Find all translation pages, i.e. titles of the form
			// "Some_page/languagecode" in the message namespaces.
			$dbr = wfGetDB( DB_REPLICA );
			$conds = [];
			$conds[] = 'page_title' . $dbr->buildLike( $dbr->anyString(), '/', $dbr->anyString() );
			$conds['page_namespace'] = $wgTranslateMessageNamespaces;

			echo "Before query\n";
			$res = $dbr->select(
				[ 'page' ],
				[ 'page_title', 'page_id' ],
				$conds,
				__METHOD__
			);
			echo "After query\n";

			$total = $res->numRows();
			$index = 0;
			foreach ( $res as $row ) {
				$index++;
				// The language code is the part of the title after the last slash.
				$code = substr( $row->page_title, strrpos( $row->page_title, '/' ) + 1 );
				if ( isset( $languages[$code] ) ) {
					$pages[$code][] = $row->page_id;
				}

				if ( $index % 10000 === 0 ) {
					$progress = number_format( $index / $total * 100, 2 );
					echo "$progress%\n";
				}
			}
			echo "\n";

			// Cap each language at $messages pages and pack the ids into a string.
			foreach ( array_keys( $pages ) as $code ) {
				if ( count( $pages[$code] ) > $messages ) {
					$pages[$code] = array_slice( $pages[$code], 0, $messages );
				}
				$pages[$code] = implode( '|', $pages[$code] );
			}
			echo "After code map\n";

			ksort( $pages );
			echo "After sort map\n";

			$cache->set( $key, $pages, 3600 * 24 );
			echo "After set map\n";
		}

		// Skip message documentation (qqq) and language variants that are too
		// close to their parent languages to be useful for detection.
		unset( $pages['qqq'] );
		unset( $pages['de-formal'] );
		unset( $pages['nl-informal'] );
		unset( $pages['en-gb'] );

		// Train one model per language, forking up to $threads child processes.
		$pids = [];
		$threads = 2;
		foreach ( $pages as $code => $pageids ) {
			$pid = ( $threads > 1 ) ? pcntl_fork() : -1;

			if ( $pid === 0 ) {
				// Child process: reseed the RNG, because forked children share
				// the parent's state: https://bugs.php.net/bug.php?id=42465
				mt_srand( getmypid() );
				$this->analyzeLanguage( $code, $pageids );
				exit();
			} elseif ( $pid === -1 ) {
				// Fork failed or only one thread requested, run serialized.
				$this->analyzeLanguage( $code, $pageids );
			} else {
				// Main thread: key by pid so the entry can be removed once the child exits.
				$pids[$pid] = $pid;
			}

			// If we hit the thread limit, wait for any child to finish.
			if ( count( $pids ) >= $threads ) {
				$status = 0;
				$pid = pcntl_wait( $status );
				unset( $pids[$pid] );
			}
		}

		// Wait for the remaining children.
		foreach ( $pids as $pid ) {
			$status = 0;
			pcntl_waitpid( $pid, $status );
		}

		// Merge the per-language models into one combined model file.
		$this->output( "Combining languages\n" );
		$huge = [];
		foreach ( glob( 'temp-*.json' ) as $file ) {
			$contents = file_get_contents( $file );
			$json = FormatJson::decode( $contents, true );
			$huge = array_merge( $json, $huge );
			$huge['data'] = array_merge( $json['data'], $huge['data'] );
		}

		$json = FormatJson::encode( $huge, true, FormatJson::ALL_OK );
		file_put_contents( 'translatewiki.net.json', $json );
	}

	/**
	 * Trains a model for one language and saves it to temp-$code.json.
	 */
	protected function analyzeLanguage( $code, $ids ) {
		if ( file_exists( "temp-$code.json" ) ) {
			$this->output( "$code MODEL EXISTS\n" );
			return;
		}

		$text = $this->cacheSourceText( $code, $ids );
		if ( $text === '' ) {
			return;
		}

		$config = new LanguageDetector\Config();
		$config->useMb( true );

		$c = new LanguageDetector\Learn( $config );
		$c->addSample( $code, $text );
		$c->addStepCallback( function ( $lang, $status ) {
			echo "Learning {$lang}: $status\n";
		} );

		$target = LanguageDetector\AbstractFormat::initFormatByPath( "temp-$code.json" );
		$c->save( $target );
	}

	/**
	 * Returns the concatenated plain text of the given pages, cached for a day.
	 * Languages with fewer than 1000 pages are skipped.
	 */
	protected function cacheSourceText( $code, $ids ) {
		$cache = wfGetCache( CACHE_DB );
		$key = wfMemcKey( __CLASS__, 'cc', $code );
		$text = $cache->get( $key );
		if ( !is_string( $text ) ) {
			$snippets = [];

			$ids = explode( '|', $ids );
			$len = count( $ids );
			if ( $len < 1000 ) {
				$this->output( "$code: $len SKIPPED\n" );
				return '';
			} else {
				$this->output( "$code PROCESSING\n" );
			}

			$time = microtime( true );
			foreach ( $ids as $id ) {
				// Parse each page through the API to get its rendered text.
				$params = new FauxRequest( [
					'pageid' => $id,
					'action' => 'parse',
					'prop' => 'text',
					'disablepp' => 'true',
				] );

				$api = new ApiMain( $params );
				$api->execute();
				$result = $api->getResult()->getResultData( null, [ 'BC' => [] ] );
				$text = $result['parse']['text']['*'];
				$text = strip_tags( $text );
				// Strip fuzzy markers and numbered message parameters ($1, $2, ...).
				$text = str_replace( '!!FUZZY!!', '', $text );
				$text = preg_replace( '/\$[0-9]/', '', $text );
				$text = trim( $text );
				$snippets[] = $text;
			}

			$text = implode( ' ', $snippets );
			$cache->set( $key, $text, 3600 * 24 );

			$delta = microtime( true ) - $time;
			$this->output( "$code TOOK $delta\n" );
		} else {
			$this->output( "$code FROM CACHE\n" );
		}

		return $text;
	}
}

$maintClass = LanguageModelCreator::class;
require_once RUN_MAINTENANCE_IF_MAIN;
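
/*
 * A minimal sketch of how the combined model file could be consumed, not part
 * of this script. It assumes the Detect class of crodas/LanguageDetector (the
 * same library used above for learning); verify the exact class and method
 * names against the installed version of the library.
 *
 *   $detect = LanguageDetector\Detect::initByPath( 'translatewiki.net.json' );
 *   // Dump the detection result for a sample string.
 *   var_dump( $detect->detect( 'Tämä sivu on käännetty' ) );
 */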