diff options
author | Yaco <franco@reevo.org> | 2020-06-04 11:01:00 -0300 |
---|---|---|
committer | Yaco <franco@reevo.org> | 2020-06-04 11:01:00 -0300 |
commit | fc7369835258467bf97eb64f184b93691f9a9fd5 (patch) | |
tree | daabd60089d2dd76d9f5fb416b005fbe159c799d /www/wiki/extensions/Translate/ttmserver |
first commit
Diffstat (limited to 'www/wiki/extensions/Translate/ttmserver')
12 files changed, 2843 insertions, 0 deletions
diff --git a/www/wiki/extensions/Translate/ttmserver/CrossLanguageTranslationSearchQuery.php b/www/wiki/extensions/Translate/ttmserver/CrossLanguageTranslationSearchQuery.php new file mode 100644 index 00000000..4b047918 --- /dev/null +++ b/www/wiki/extensions/Translate/ttmserver/CrossLanguageTranslationSearchQuery.php @@ -0,0 +1,154 @@
<?php
/**
 * Cross Language Translation Search.
 *
 * Searches message definitions in the source language and reports them,
 * filtered by their translation state, for the target language.
 *
 * @since 2015.08
 */
class CrossLanguageTranslationSearchQuery {
	/** @var SearchableTTMServer Backend used to create the search */
	protected $server;

	/**
	 * @var array Search parameters. This class reads the keys 'query',
	 *  'sourcelanguage', 'language', 'filter', 'offset' and 'limit'; the
	 *  whole array is also forwarded to the backend as search options.
	 */
	protected $params;

	/** @var \Elastica\ResultSet|null First scroll page of the last search, if any */
	protected $resultset;

	/** @var int Total hits reported by the first scroll page */
	protected $total = 0;

	/** @var string[] Highlight pre and post tags handed to the backend */
	protected $hl = [ '', '' ];

	/**
	 * @param array $params
	 * @param SearchableTTMServer $server
	 */
	public function __construct( array $params, SearchableTTMServer $server ) {
		$this->params = $params;
		$this->server = $server;
	}

	/**
	 * Run the search and return the requested page of matching messages.
	 *
	 * @return array[] Items as built by extractMessages(), sliced by the
	 *  'offset' and 'limit' parameters.
	 */
	public function getDocuments() {
		$documents = [];
		$offset = $this->params['offset'];
		$limit = $this->params['limit'];

		$options = $this->params;
		$options['language'] = $this->params['sourcelanguage'];
		// Use a bigger limit than what was requested, since we are likely to throw away many
		// results in the local filtering step at extractMessages
		$options['limit'] = $limit * 10;
		// TODO: the real offset should be communicated to the frontend. It currently assumes
		// next offset is current offset + limit and previous one is current offset - limit.
		// It might be difficult to fix scrolling results backwards. For now we handle offset
		// locally.
		$options['offset'] = 0;

		$search = $this->server->createSearch( $this->params['query'], $options, $this->hl );
		$scroll = $search->scroll( '5s' );

		// Used for aggregations. Only the first scroll response has them.
		// Reset both so that a search without any page does not report stale values.
		$this->resultset = null;
		$this->total = 0;

		foreach ( $scroll as $resultSet ) {
			if ( $this->resultset === null ) {
				$this->resultset = $resultSet;
				$this->total = $resultSet->getTotalHits();
			}

			$results = $this->extractMessages( $resultSet->getDocuments() );
			$documents = array_merge( $documents, $results );

			// Stop scrolling as soon as the requested page can be filled
			if ( count( $documents ) >= $offset + $limit ) {
				break;
			}
		}

		// clear was introduced in Elastica 5.3.1, but Elastica extension uses 5.3.0
		if ( is_callable( [ $scroll, 'clear' ] ) ) {
			$scroll->clear();
		}

		return array_slice( $documents, $offset, $limit );
	}

	/**
	 * Extract messages from the documents and build message definitions.
	 * Create a message collection from the definitions in the target language.
	 * Filter the message collection to get filtered messages.
	 * Slicing by offset and limit is done by the caller, getDocuments().
	 *
	 * @param \Elastica\Document[] $documents
	 * @return array[] Each item has the keys 'content', 'localid' and 'language'.
	 */
	protected function extractMessages( $documents ) {
		$messages = $ret = [];

		$language = $this->params['language'];
		foreach ( $documents as $document ) {
			$data = $document->getData();

			if ( !$this->server->isLocalSuggestion( $data ) ) {
				continue;
			}

			$title = Title::newFromText( $data['localid'] );
			if ( !$title ) {
				continue;
			}

			$handle = new MessageHandle( $title );
			if ( !$handle->isValid() ) {
				continue;
			}

			$key = $title->getNamespace() . ':' . $title->getDBkey();
			$messages[$key] = $data['content'];
		}

		if ( $messages === [] ) {
			// Nothing survived the local checks: skip building and filtering a collection
			return $ret;
		}

		$definitions = new MessageDefinitions( $messages );
		$collection = MessageCollection::newFromDefinitions( $definitions, $language );

		$filter = $this->params['filter'];
		$wantsTranslation = $filter === 'translated' || $filter === 'fuzzy';
		if ( $filter === 'untranslated' ) {
			$collection->filter( 'hastranslation', true );
		} elseif ( in_array( $filter, $this->getAvailableFilters(), true ) ) {
			// Strict comparison: a non-string filter value must not match by type juggling
			$collection->filter( $filter, false );
		}

		if ( $wantsTranslation ) {
			$collection->loadTranslations();
		}

		foreach ( $collection->keys() as $mkey => $title ) {
			$result = [];
			$result['content'] = $messages[$mkey];
			if ( $wantsTranslation ) {
				// Show the translation instead of the definition
				$result['content'] = $collection[$mkey]->translation();
			}
			$handle = new MessageHandle( $title );
			$result['localid'] = $handle->getTitleForBase()->getPrefixedText();
			$result['language'] = $language;

			$ret[] = $result;
		}

		return $ret;
	}

	/**
	 * @return string[] Filter names accepted in the 'filter' parameter
	 */
	public function getAvailableFilters() {
		return [
			'translated',
			'fuzzy',
			'untranslated'
		];
	}

	/**
	 * @return int Total hits of the last getDocuments() call, 0 if there was none
	 */
	public function getTotalHits() {
		return $this->total;
	}

	/**
	 * @return \Elastica\ResultSet|null First scroll page of the last getDocuments() call
	 */
	public function getResultSet() {
		return $this->resultset;
	}
}
diff --git a/www/wiki/extensions/Translate/ttmserver/DatabaseTTMServer.php b/www/wiki/extensions/Translate/ttmserver/DatabaseTTMServer.php new file mode 100644 index 00000000..70c99b64 --- /dev/null +++ b/www/wiki/extensions/Translate/ttmserver/DatabaseTTMServer.php @@ -0,0 +1,282 @@
<?php
/**
 * TTMServer - The Translate extension translation memory interface
 *
 * @file
 * @author Niklas Laxström
 * @copyright Copyright © 2012-2013, Niklas Laxström
 * @license GPL-2.0-or-later
 * @ingroup TTMServer
 */

use Wikimedia\Rdbms\DBQueryError;

/**
 * Mysql based backend.
+ * @ingroup TTMServer + * @since 2012-06-27 + */ +class DatabaseTTMServer extends TTMServer implements WritableTTMServer, ReadableTTMServer { + protected $sids; + + /** + * @param int $mode DB_REPLICA|DB_MASTER + * @return \Wikimedia\Rdbms\IDatabase + */ + protected function getDB( $mode = DB_REPLICA ) { + return wfGetDB( $mode, 'ttmserver', $this->config['database'] ); + } + + public function update( MessageHandle $handle, $targetText ) { + if ( !$handle->isValid() || $handle->getCode() === '' ) { + return false; + } + + $mkey = $handle->getKey(); + $group = $handle->getGroup(); + $targetLanguage = $handle->getCode(); + $sourceLanguage = $group->getSourceLanguage(); + + // Skip definitions to not slow down mass imports etc. + // These will be added when the first translation is made + if ( $targetLanguage === $sourceLanguage ) { + return false; + } + + $definition = $group->getMessage( $mkey, $sourceLanguage ); + if ( !is_string( $definition ) || !strlen( trim( $definition ) ) ) { + return false; + } + + $context = Title::makeTitle( $handle->getTitle()->getNamespace(), $mkey ); + $dbw = $this->getDB( DB_MASTER ); + /* Check that the definition exists and fetch the sid. If not, add + * the definition and retrieve the sid. If the definition changes, + * we will create a new entry - otherwise we could at some point + * get suggestions which do not match the original definition any + * longer. The old translations are still kept until purged by + * rerunning the bootstrap script. */ + $conds = [ + 'tms_context' => $context->getPrefixedText(), + 'tms_text' => $definition, + ]; + + $sid = $dbw->selectField( 'translate_tms', 'tms_sid', $conds, __METHOD__ ); + if ( $sid === false ) { + $sid = $this->insertSource( $context, $sourceLanguage, $definition ); + } + + // Delete old translations for this message if any. 
Could also use replace + $deleteConds = [ + 'tmt_sid' => $sid, + 'tmt_lang' => $targetLanguage, + ]; + $dbw->delete( 'translate_tmt', $deleteConds, __METHOD__ ); + + // Insert the new translation + if ( $targetText !== null ) { + $row = $deleteConds + [ + 'tmt_text' => $targetText, + ]; + + $dbw->insert( 'translate_tmt', $row, __METHOD__ ); + } + + return true; + } + + protected function insertSource( Title $context, $sourceLanguage, $text ) { + $row = [ + 'tms_lang' => $sourceLanguage, + 'tms_len' => mb_strlen( $text ), + 'tms_text' => $text, + 'tms_context' => $context->getPrefixedText(), + ]; + + $dbw = $this->getDB( DB_MASTER ); + $dbw->insert( 'translate_tms', $row, __METHOD__ ); + $sid = $dbw->insertId(); + + $fulltext = $this->filterForFulltext( $sourceLanguage, $text ); + if ( count( $fulltext ) ) { + $row = [ + 'tmf_sid' => $sid, + 'tmf_text' => implode( ' ', $fulltext ), + ]; + $dbw->insert( 'translate_tmf', $row, __METHOD__ ); + } + + return $sid; + } + + /** + * Tokenizes the text for fulltext search. + * Tries to find the most useful tokens. 
+ * + * @param string $language Language code + * @param string $input + * @return array + */ + protected function filterForFulltext( $language, $input ) { + $lang = Language::factory( $language ); + + $text = preg_replace( '/[^[:alnum:]]/u', ' ', $input ); + $text = $lang->segmentByWord( $text ); + $text = $lang->lc( $text ); + $segments = preg_split( '/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY ); + if ( count( $segments ) < 4 ) { + return []; + } + + foreach ( $segments as $i => $segment ) { + // Yes strlen + $len = strlen( $segment ); + if ( $len < 4 || $len > 15 ) { + unset( $segments[$i] ); + } + } + + $segments = array_unique( $segments ); + $segments = array_slice( $segments, 0, 10 ); + + return $segments; + } + + public function beginBootstrap() { + $dbw = $this->getDB( DB_MASTER ); + $dbw->delete( 'translate_tms', '*', __METHOD__ ); + $dbw->delete( 'translate_tmt', '*', __METHOD__ ); + $dbw->delete( 'translate_tmf', '*', __METHOD__ ); + $table = $dbw->tableName( 'translate_tmf' ); + try { + $dbw->query( "DROP INDEX tmf_text ON $table" ); + } catch ( DBQueryError $e ) { + // Perhaps the script was aborted before it got + // chance to add the index back. 
+ } + } + + public function beginBatch() { + $this->sids = []; + } + + public function batchInsertDefinitions( array $batch ) { + foreach ( $batch as $key => $item ) { + list( $title, $language, $text ) = $item; + $handle = new MessageHandle( $title ); + $context = Title::makeTitle( $handle->getTitle()->getNamespace(), $handle->getKey() ); + $this->sids[$key] = $this->insertSource( $context, $language, $text ); + } + wfWaitForSlaves( 10 ); + } + + public function batchInsertTranslations( array $batch ) { + $rows = []; + foreach ( $batch as $key => $data ) { + list( , $language, $text ) = $data; + $rows[] = [ + 'tmt_sid' => $this->sids[$key], + 'tmt_lang' => $language, + 'tmt_text' => $text, + ]; + } + + $dbw = $this->getDB( DB_MASTER ); + $dbw->insert( 'translate_tmt', $rows, __METHOD__ ); + wfWaitForSlaves( 10 ); + } + + public function endBatch() { + } + + public function endBootstrap() { + $dbw = $this->getDB( DB_MASTER ); + $table = $dbw->tableName( 'translate_tmf' ); + $dbw->query( "CREATE FULLTEXT INDEX tmf_text ON $table (tmf_text)" ); + } + + /* Reading interface */ + + public function isLocalSuggestion( array $suggestion ) { + return true; + } + + public function expandLocation( array $suggestion ) { + $title = Title::newFromText( $suggestion['location'] ); + + return $title->getCanonicalURL(); + } + + public function query( $sourceLanguage, $targetLanguage, $text ) { + // Calculate the bounds of the string length which are able + // to satisfy the cutoff percentage in edit distance. 
+ $len = mb_strlen( $text ); + $min = ceil( max( $len * $this->config['cutoff'], 2 ) ); + $max = floor( $len / $this->config['cutoff'] ); + + // We could use fulltext index to narrow the results further + $dbr = $this->getDB( DB_REPLICA ); + $tables = [ 'translate_tmt', 'translate_tms' ]; + $fields = [ 'tms_context', 'tms_text', 'tmt_lang', 'tmt_text' ]; + + $conds = [ + 'tms_lang' => $sourceLanguage, + 'tmt_lang' => $targetLanguage, + "tms_len BETWEEN $min AND $max", + 'tms_sid = tmt_sid', + ]; + + $fulltext = $this->filterForFulltext( $sourceLanguage, $text ); + if ( $fulltext ) { + $tables[] = 'translate_tmf'; + $list = implode( ' ', $fulltext ); + $conds[] = 'tmf_sid = tmt_sid'; + $conds[] = "MATCH(tmf_text) AGAINST( '$list' )"; + } + + $res = $dbr->select( $tables, $fields, $conds, __METHOD__ ); + + return $this->processQueryResults( $res, $text, $targetLanguage ); + } + + protected function processQueryResults( $res, $text, $targetLanguage ) { + $timeLimit = microtime( true ) + 5; + + $lenA = mb_strlen( $text ); + $results = []; + foreach ( $res as $row ) { + if ( microtime( true ) > $timeLimit ) { + // Having no suggestions is better than preventing translation + // altogether by timing out the request :( + break; + } + + $a = $text; + $b = $row->tms_text; + $lenB = mb_strlen( $b ); + $len = min( $lenA, $lenB ); + if ( $len > 600 ) { + // two strings of length 1500 ~ 10s + // two strings of length 2250 ~ 30s + $dist = $len; + } else { + $dist = self::levenshtein( $a, $b, $lenA, $lenB ); + } + $quality = 1 - ( $dist * 0.9 / $len ); + + if ( $quality >= $this->config['cutoff'] ) { + $results[] = [ + 'source' => $row->tms_text, + 'target' => $row->tmt_text, + 'context' => $row->tms_context, + 'location' => $row->tms_context . '/' . $targetLanguage, + 'quality' => $quality, + 'wiki' => $row->tms_wiki ?? 
wfWikiID(), + ]; + } + } + $results = TTMServer::sortSuggestions( $results ); + + return $results; + } +} diff --git a/www/wiki/extensions/Translate/ttmserver/ElasticSearchTTMServer.php b/www/wiki/extensions/Translate/ttmserver/ElasticSearchTTMServer.php new file mode 100644 index 00000000..0835d518 --- /dev/null +++ b/www/wiki/extensions/Translate/ttmserver/ElasticSearchTTMServer.php @@ -0,0 +1,890 @@ +<?php +/** + * TTMServer - The Translate extension translation memory interface + * + * @file + * @author Niklas Laxström + * @license GPL-2.0-or-later + * @ingroup TTMServer + */ + +use MediaWiki\Logger\LoggerFactory; + +/** + * TTMServer backed based on ElasticSearch. Depends on Elastica. + * @since 2014.04 + * @ingroup TTMServer + */ +class ElasticSearchTTMServer + extends TTMServer + implements ReadableTTMServer, WritableTTMServer, SearchableTTMserver +{ + /** + * @const int number of documents that will be loaded and deleted in a + * single operation + */ + const BULK_DELETE_CHUNK_SIZE = 100; + + /** + * @const int in case a write operation fails during a batch process + * this constant controls the number of times we will retry the same + * operation. + */ + const BULK_INDEX_RETRY_ATTEMPTS = 5; + + /** + * @const int time (seconds) to wait for the index to ready before + * starting to index. Since we wait for index status it can be relatively + * long especially if some nodes are restarted. + */ + const WAIT_UNTIL_READY_TIMEOUT = 3600; + + /** + * Flag in the frozen index that indicates that all indices + * are frozen (useful only when this service shares the cluster with + * CirrusSearch) + */ + const ALL_INDEXES_FROZEN_NAME = 'freeze_everything'; + + /** + * Type used in the frozen index + */ + const FROZEN_TYPE = 'frozen'; + + /** + * @var \Elastica\Client + */ + protected $client; + + /** + * Reference to the maintenance script to relay logging output. 
+ */ + protected $logger; + + /** + * Used for Reindex + */ + protected $updateMapping = false; + + public function isLocalSuggestion( array $suggestion ) { + return $suggestion['wiki'] === wfWikiID(); + } + + public function expandLocation( array $suggestion ) { + return $suggestion['uri']; + } + + public function query( $sourceLanguage, $targetLanguage, $text ) { + try { + return $this->doQuery( $sourceLanguage, $targetLanguage, $text ); + } catch ( Exception $e ) { + throw new TranslationHelperException( 'Elastica exception: ' . $e ); + } + } + + protected function doQuery( $sourceLanguage, $targetLanguage, $text ) { + if ( !$this->useWikimediaExtraPlugin() ) { + // ElasticTTM is currently not compatible with elasticsearch 2.x/5.x + // It needs FuzzyLikeThis ported via the wmf extra plugin + throw new \RuntimeException( 'The wikimedia extra plugin is mandatory.' ); + } + /* Two query system: + * 1) Find all strings in source language that match text + * 2) Do another query for translations for those strings + */ + $connection = $this->getClient()->getConnection(); + $oldTimeout = $connection->getTimeout(); + $connection->setTimeout( 10 ); + + $fuzzyQuery = new FuzzyLikeThis(); + $fuzzyQuery->setLikeText( $text ); + $fuzzyQuery->addFields( [ 'content' ] ); + + $boostQuery = new \Elastica\Query\FunctionScore(); + $boostQuery->addFunction( + 'levenshtein_distance_score', + [ + 'text' => $text, + 'field' => 'content' + ] + ); + $boostQuery->setBoostMode( \Elastica\Query\FunctionScore::BOOST_MODE_REPLACE ); + + // Wrap the fuzzy query so it can be used as a filter. + // This is slightly faster, as ES can throw away the scores by this query. 
+ $bool = new \Elastica\Query\BoolQuery(); + $bool->addFilter( $fuzzyQuery ); + $bool->addMust( $boostQuery ); + + $languageFilter = new \Elastica\Query\Term(); + $languageFilter->setTerm( 'language', $sourceLanguage ); + $bool->addFilter( $languageFilter ); + + // The whole query + $query = new \Elastica\Query(); + $query->setQuery( $bool ); + + // The interface usually displays three best candidates. These might + // come from more than three source things, if the translations are + // the same. In other words suggestions are grouped by the suggested + // translation. This algorithm might not find all suggestions, if the + // top N best matching source texts don't have equivalent translations + // in the target language, but worse matches which we did not fetch do. + // This code tries to balance between doing too many or too big queries + // and not fetching enough results to show all possible suggestions. + $sizeFirst = 100; + $sizeSecond = $sizeFirst * 5; + + $query->setFrom( 0 ); + $query->setSize( $sizeFirst ); + $query->setParam( '_source', [ 'content' ] ); + $cutoff = $this->config['cutoff'] ?? 0.65; + $query->setParam( 'min_score', $cutoff ); + $query->setSort( [ '_score', '_uid' ] ); + + /* This query is doing two unrelated things: + * 1) Collect the message contents and scores so that they can + * be accessed later for the translations we found. + * 2) Build the query string for the query that fetches the translations. + */ + $contents = $scores = $terms = []; + do { + $resultset = $this->getType()->search( $query ); + + if ( count( $resultset ) === 0 ) { + break; + } + + foreach ( $resultset->getResults() as $result ) { + $data = $result->getData(); + $score = $result->getScore(); + + $sourceId = preg_replace( '~/[^/]+$~', '', $result->getId() ); + $contents[$sourceId] = $data['content']; + $scores[$sourceId] = $score; + $terms[] = "$sourceId/$targetLanguage"; + } + + // Check if it looks like that we are hitting the long tail already. 
+ // Otherwise, we'll do a query to fetch some more to reach a "sane" + // breaking point, i.e. include all suggestions with same content + // for reliable used X times statistics. + if ( count( array_unique( $scores ) ) > 5 ) { + break; + } + + // Okay, We are now in second iteration of the loop. We already got + // lots of suggestions. We will give up for now even if it means we + // return in some sense incomplete results. + if ( count( $resultset ) === $sizeSecond ) { + break; + } + + // After the first query, the smallest score is the new threshold. + $query->setParam( 'min_score', $score ); + $query->setFrom( $query->getParam( 'size' ) + $query->getParam( 'from' ) ); + $query->setSize( $sizeSecond ); + + // Break if we already got all hits + } while ( $resultset->getTotalHits() > count( $contents ) ); + + $suggestions = []; + + // Skip second query if first query found nothing. Keeping only one return + // statement in this method to avoid forgetting to reset connection timeout + if ( $terms !== [] ) { + $idQuery = new \Elastica\Query\Terms(); + $idQuery->setTerms( '_id', $terms ); + + $query = new \Elastica\Query( $idQuery ); + $query->setSize( 25 ); + $query->setParam( '_source', [ 'wiki', 'uri', 'content', 'localid' ] ); + $resultset = $this->getType()->search( $query ); + + foreach ( $resultset->getResults() as $result ) { + $data = $result->getData(); + + // Construct the matching source id + $sourceId = preg_replace( '~/[^/]+$~', '', $result->getId() ); + + $suggestions[] = [ + 'source' => $contents[$sourceId], + 'target' => $data['content'], + 'context' => $data['localid'], + 'quality' => $scores[$sourceId], + 'wiki' => $data['wiki'], + 'location' => $data['localid'] . '/' . $targetLanguage, + 'uri' => $data['uri'], + ]; + } + + // Ensure results are in quality order + uasort( $suggestions, function ( $a, $b ) { + if ( $a['quality'] === $b['quality'] ) { + return 0; + } + + return ( $a['quality'] < $b['quality'] ) ? 
1 : -1; + } ); + } + + $connection->setTimeout( $oldTimeout ); + + return $suggestions; + } + + /* Write functions */ + + /** + * Add / update translations. + * + * @param MessageHandle $handle + * @param ?string $targetText + * @throws \RuntimeException + * @return bool + */ + public function update( MessageHandle $handle, $targetText ) { + if ( !$handle->isValid() || $handle->getCode() === '' ) { + return false; + } + + /* There are various different cases here: + * [new or updated] [fuzzy|non-fuzzy] [translation|definition] + * 1) We don't distinguish between new or updated here. + * 2) Delete old translation, but not definition + * 3) Insert new translation or definition, if non-fuzzy + * The definition should never be fuzzied anyway. + * + * These only apply to known messages. + */ + + $sourceLanguage = $handle->getGroup()->getSourceLanguage(); + + // Do not delete definitions, because the translations are attached to that + if ( $handle->getCode() !== $sourceLanguage ) { + $localid = $handle->getTitleForBase()->getPrefixedText(); + $this->deleteByQuery( $this->getType(), Elastica\Query::create( + ( new \Elastica\Query\BoolQuery() ) + ->addFilter( new Elastica\Query\Term( [ 'wiki' => wfWikiID() ] ) ) + ->addFilter( new Elastica\Query\Term( [ 'language' => $handle->getCode() ] ) ) + ->addFilter( new Elastica\Query\Term( [ 'localid' => $localid ] ) ) ) ); + } + + // If translation was made fuzzy, we do not need to add anything + if ( $targetText === null ) { + return true; + } + + $revId = $handle->getTitleForLanguage( $sourceLanguage )->getLatestRevID(); + $doc = $this->createDocument( $handle, $targetText, $revId ); + $fname = __METHOD__; + + MWElasticUtils::withRetry( self::BULK_INDEX_RETRY_ATTEMPTS, + function () use ( $doc ) { + $this->getType()->addDocument( $doc ); + }, + function ( $e, $errors ) use ( $fname ) { + $c = get_class( $e ); + $msg = $e->getMessage(); + error_log( $fname . ": update failed ($c: $msg); retrying." 
); + sleep( 10 ); + } + ); + + return true; + } + + /** + * @param MessageHandle $handle + * @param string $text + * @param int $revId + * @return \Elastica\Document + */ + protected function createDocument( MessageHandle $handle, $text, $revId ) { + $language = $handle->getCode(); + + $localid = $handle->getTitleForBase()->getPrefixedText(); + $wiki = wfWikiID(); + $globalid = "$wiki-$localid-$revId/$language"; + + $data = [ + 'wiki' => $wiki, + 'uri' => $handle->getTitle()->getCanonicalURL(), + 'localid' => $localid, + 'language' => $language, + 'content' => $text, + 'group' => $handle->getGroupIds(), + ]; + + return new \Elastica\Document( $globalid, $data ); + } + + /** + * Create index + * @param bool $rebuild Deletes index first if already exists + */ + public function createIndex( $rebuild ) { + $indexSettings = [ + 'number_of_shards' => $this->getShardCount(), + 'analysis' => [ + 'filter' => [ + 'prefix_filter' => [ + 'type' => 'edge_ngram', + 'min_gram' => 2, + 'max_gram' => 20 + ] + ], + 'analyzer' => [ + 'prefix' => [ + 'type' => 'custom', + 'tokenizer' => 'standard', + 'filter' => [ 'standard', 'lowercase', 'prefix_filter' ] + ], + 'casesensitive' => [ + 'tokenizer' => 'standard', + 'filter' => [ 'standard' ] + ] + ] + ] + ]; + $replicas = $this->getReplicaCount(); + if ( strpos( $replicas, '-' ) === false ) { + $indexSettings['number_of_replicas'] = $replicas; + } else { + $indexSettings['auto_expand_replicas'] = $replicas; + } + + $type = $this->getType(); + $type->getIndex()->create( $indexSettings, $rebuild ); + } + + /** + * Begin the bootstrap process. + * + * @throws \RuntimeException + */ + public function beginBootstrap() { + $type = $this->getType(); + if ( $this->updateMapping ) { + $this->logOutput( 'Updating the index mappings...' 
); + $this->createIndex( true ); + } elseif ( !$type->getIndex()->exists() ) { + $this->createIndex( false ); + } + + $settings = $type->getIndex()->getSettings(); + $settings->setRefreshInterval( '-1' ); + + $this->deleteByQuery( $this->getType(), \Elastica\Query::create( + ( new Elastica\Query\Term() )->setTerm( 'wiki', wfWikiID() ) ) ); + + $mapping = new \Elastica\Type\Mapping(); + $mapping->setType( $type ); + $mapping->setProperties( [ + 'wiki' => [ 'type' => 'keyword' ], + 'localid' => [ 'type' => 'keyword' ], + 'uri' => [ 'type' => 'keyword' ], + 'language' => [ 'type' => 'keyword' ], + 'group' => [ 'type' => 'keyword' ], + 'content' => [ + 'type' => 'text', + 'fields' => [ + 'content' => [ + 'type' => 'text', + 'term_vector' => 'yes' + ], + 'prefix_complete' => [ + 'type' => 'text', + 'analyzer' => 'prefix', + 'search_analyzer' => 'standard', + 'term_vector' => 'yes' + ], + 'case_sensitive' => [ + 'type' => 'text', + 'analyzer' => 'casesensitive', + 'term_vector' => 'yes' + ] + ] + ], + ] ); + $mapping->send(); + + $this->waitUntilReady(); + } + + public function beginBatch() { + // I hate the rule that forbids {} + } + + public function batchInsertDefinitions( array $batch ) { + $lb = new LinkBatch(); + foreach ( $batch as $data ) { + $lb->addObj( $data[0]->getTitle() ); + } + $lb->execute(); + + $this->batchInsertTranslations( $batch ); + } + + public function batchInsertTranslations( array $batch ) { + $docs = []; + foreach ( $batch as $data ) { + list( $handle, $sourceLanguage, $text ) = $data; + $revId = $handle->getTitleForLanguage( $sourceLanguage )->getLatestRevID(); + $docs[] = $this->createDocument( $handle, $text, $revId ); + } + + MWElasticUtils::withRetry( self::BULK_INDEX_RETRY_ATTEMPTS, + function () use ( $docs ) { + $this->getType()->addDocuments( $docs ); + }, + function ( $e, $errors ) { + $c = get_class( $e ); + $msg = $e->getMessage(); + $this->logOutput( "Batch failed ($c: $msg), trying again in 10 seconds" ); + sleep( 10 ); + } + ); 
+ } + + public function endBatch() { + // I hate the rule that forbids {} + } + + public function endBootstrap() { + $index = $this->getType()->getIndex(); + $index->refresh(); + $index->forcemerge(); + $index->getSettings()->setRefreshInterval( '5s' ); + } + + public function getClient() { + if ( !$this->client ) { + if ( isset( $this->config['config'] ) ) { + $this->client = new \Elastica\Client( $this->config['config'] ); + } else { + $this->client = new \Elastica\Client(); + } + } + return $this->client; + } + + /** + * @return true if the backend is configured with the wikimedia extra plugin + */ + public function useWikimediaExtraPlugin() { + return isset( $this->config['use_wikimedia_extra'] ) && $this->config['use_wikimedia_extra']; + } + + /** + * @return string + */ + private function getIndexName() { + if ( isset( $this->config['index'] ) ) { + return $this->config['index']; + } else { + return 'ttmserver'; + } + } + + public function getType() { + return $this->getClient() + ->getIndex( $this->getIndexName() ) + ->getType( 'message' ); + } + + protected function getShardCount() { + return $this->config['shards'] ?? 1; + } + + protected function getReplicaCount() { + return $this->config['replicas'] ?? '0-2'; + } + + /** + * Get index health + * TODO: Remove this code in the future as we drop support for + * older versions of the Elastica extension. + * + * @param string $indexName + * @return array the index health status + */ + protected function getIndexHealth( $indexName ) { + $path = "_cluster/health/$indexName"; + $response = $this->getClient()->request( $path ); + if ( $response->hasError() ) { + throw new \Exception( "Error while fetching index health status: " . $response->getError() ); + } + return $response->getData(); + } + + /** + * Wait for the index to go green + * + * NOTE: This method has been copied and adjusted from + * CirrusSearch/includes/Maintenance/ConfigUtils.php. 
Ideally we'd + * like to make these utility methods available in the Elastica + * extension, but this one requires some refactoring in cirrus first. + * TODO: Remove this code in the future as we drop support for + * older versions of the Elastica extension. + * + * @param string $indexName + * @param int $timeout + * @return bool true if the index is green false otherwise. + */ + protected function waitForGreen( $indexName, $timeout ) { + $startTime = time(); + while ( ( $startTime + $timeout ) > time() ) { + try { + $response = $this->getIndexHealth( $indexName ); + $status = isset( $response['status'] ) ? $response['status'] : 'unknown'; + if ( $status === 'green' ) { + $this->logOutput( "\tGreen!" ); + return true; + } + $this->logOutput( "\tIndex is $status retrying..." ); + sleep( 5 ); + } catch ( \Exception $e ) { + $this->logOutput( "Error while waiting for green ({$e->getMessage()}), retrying..." ); + } + } + return false; + } + + protected function waitUntilReady() { + if ( method_exists( 'MWElasticUtils', 'waitForGreen' ) ) { + $statuses = MWElasticUtils::waitForGreen( + $this->getClient(), + $this->getIndexName(), + self::WAIT_UNTIL_READY_TIMEOUT ); + $this->logOutput( "Waiting for the index to go green..." ); + foreach ( $statuses as $message ) { + $this->logOutput( $message ); + } + + if ( !$statuses->getReturn() ) { + die( "Timeout! Please check server logs for {$this->getIndexName()}." ); + } + + return; + } + + // TODO: This code can be removed in the future as we drop support for + // older versions of the Elastica extension. + $indexName = $this->getType()->getIndex()->getName(); + $this->logOutput( "Waiting for the index to go green..." ); + if ( !$this->waitForGreen( $indexName, self::WAIT_UNTIL_READY_TIMEOUT ) ) { + die( "Timeout! Please check server logs for {$this->getIndex()->getName()}." ); + } + } + + public function setLogger( $logger ) { + $this->logger = $logger; + } + + // Can it get any uglier? 
+ protected function logOutput( $text ) { + if ( $this->logger ) { + $this->logger->statusLine( "$text\n" ); + } + } + + /** + * Force the update of index mappings + * @since 2015.03 + */ + public function doMappingUpdate() { + $this->updateMapping = true; + } + + /** + * Parse query string and build the search query + * @param string $queryString + * @param array $opts + * @return array + */ + protected function parseQueryString( $queryString, array $opts ) { + $fields = $highlights = []; + $terms = preg_split( '/\s+/', $queryString ); + $match = $opts['match']; + $case = $opts['case']; + + // Map each word in the query string with its corresponding field + foreach ( $terms as $term ) { + $prefix = strstr( $term, '*', true ); + if ( $prefix ) { + // For wildcard search + $fields['content.prefix_complete'][] = $prefix; + } elseif ( $case === '1' ) { + // For case sensitive search + $fields['content.case_sensitive'][] = $term; + } else { + $fields['content'][] = $term; + } + } + + // Allow searching either by message content or message id (page name + // without language subpage) with exact match only. + $searchQuery = new \Elastica\Query\BoolQuery(); + foreach ( $fields as $analyzer => $words ) { + foreach ( $words as $word ) { + $boolQuery = new \Elastica\Query\BoolQuery(); + $contentQuery = new \Elastica\Query\Match(); + $contentQuery->setFieldQuery( $analyzer, $word ); + $boolQuery->addShould( $contentQuery ); + $messageQuery = new \Elastica\Query\Term(); + $messageQuery->setTerm( 'localid', $word ); + $boolQuery->addShould( $messageQuery ); + + if ( $match === 'all' ) { + $searchQuery->addMust( $boolQuery ); + } else { + $searchQuery->addShould( $boolQuery ); + } + + // Fields for highlighting + $highlights[$analyzer] = [ + 'number_of_fragments' => 0 + ]; + + // Allow searching by exact message title (page name with + // language subpage). 
+ $title = Title::newFromText( $word ); + if ( !$title ) { + continue; + } + $handle = new MessageHandle( $title ); + if ( $handle->isValid() && $handle->getCode() !== '' ) { + $localid = $handle->getTitleForBase()->getPrefixedText(); + $boolQuery = new \Elastica\Query\BoolQuery(); + $messageId = new \Elastica\Query\Term(); + $messageId->setTerm( 'localid', $localid ); + $boolQuery->addMust( $messageId ); + $searchQuery->addShould( $boolQuery ); + } + } + } + + return [ $searchQuery, $highlights ]; + } + + /** + * Search interface + * @param string $queryString + * @param array $opts + * @param array $highlight + * @return \Elastica\Search + */ + public function createSearch( $queryString, $opts, $highlight ) { + $query = new \Elastica\Query(); + + list( $searchQuery, $highlights ) = $this->parseQueryString( $queryString, $opts ); + $query->setQuery( $searchQuery ); + + $language = new \Elastica\Aggregation\Terms( 'language' ); + $language->setField( 'language' ); + $language->setSize( 500 ); + $query->addAggregation( $language ); + + $group = new \Elastica\Aggregation\Terms( 'group' ); + $group->setField( 'group' ); + // Would like to prioritize the top level groups and not show subgroups + // if the top group has only few hits, but that doesn't seem to be possile. + $group->setSize( 500 ); + $query->addAggregation( $group ); + + $query->setSize( $opts['limit'] ); + $query->setFrom( $opts['offset'] ); + + // BoolAnd filters are executed in sequence per document. Bool filters with + // multiple must clauses are executed by converting each filter into a bit + // field then anding them together. The latter is normally faster if either + // of the subfilters are reused. May not make a difference in this context. 
+ $filters = new \Elastica\Query\BoolQuery(); + + $language = $opts['language']; + if ( $language !== '' ) { + $languageFilter = new \Elastica\Query\Term(); + $languageFilter->setTerm( 'language', $language ); + $filters->addFilter( $languageFilter ); + } + + $group = $opts['group']; + if ( $group !== '' ) { + $groupFilter = new \Elastica\Query\Term(); + $groupFilter->setTerm( 'group', $group ); + $filters->addFilter( $groupFilter ); + } + + // Check that we have at least one filter to avoid invalid query errors. + if ( $language !== '' || $group !== '' ) { + // TODO: This seems wrong, but perhaps for aggregation purposes? + // should make $search a must clause and use the bool query + // as main. + $query->setPostFilter( $filters ); + } + + list( $pre, $post ) = $highlight; + $query->setHighlight( [ + // The value must be an object + 'pre_tags' => [ $pre ], + 'post_tags' => [ $post ], + 'fields' => $highlights, + ] ); + + return $this->getType()->getIndex()->createSearch( $query ); + } + + /** + * Search interface + * @param string $queryString + * @param array $opts + * @param array $highlight + * @throws TTMServerException + * @return \Elastica\ResultSet + */ + public function search( $queryString, $opts, $highlight ) { + $search = $this->createSearch( $queryString, $opts, $highlight ); + + try { + return $search->search(); + } catch ( \Elastica\Exception\ExceptionInterface $e ) { + throw new TTMServerException( $e->getMessage() ); + } + } + + public function getFacets( $resultset ) { + $aggs = $resultset->getAggregations(); + + $ret = [ + 'language' => [], + 'group' => [] + ]; + + foreach ( $aggs as $type => $info ) { + foreach ( $info['buckets'] as $row ) { + $ret[$type][$row['key']] = $row['doc_count']; + } + } + + return $ret; + } + + public function getTotalHits( $resultset ) { + return $resultset->getTotalHits(); + } + + public function getDocuments( $resultset ) { + $ret = []; + foreach ( $resultset->getResults() as $document ) { + $data = 
$document->getData(); + $hl = $document->getHighlights(); + if ( isset( $hl['content.prefix_complete'][0] ) ) { + $data['content'] = $hl['content.prefix_complete'][0]; + } elseif ( isset( $hl['content.case_sensitive'][0] ) ) { + $data['content'] = $hl['content.case_sensitive'][0]; + } elseif ( isset( $hl['content'][0] ) ) { + $data['content'] = $hl['content'][0]; + } + $ret[] = $data; + } + + return $ret; + } + + /** + * Delete docs by query by using the scroll API. + * TODO: Elastica\Index::deleteByQuery() ? was removed + * in 2.x and returned in 5.x. + * + * @param \Elastica\Type $type the source index + * @param \Elastica\Query $query the query + * @throws \RuntimeException + */ + private function deleteByQuery( \Elastica\Type $type, \Elastica\Query $query ) { + if ( method_exists( 'MWElasticUtils', 'deleteByQuery' ) ) { + try { + MWElasticUtils::deleteByQuery( $type->getIndex(), $query, /* $allowConflicts = */ true ); + } catch ( \Exception $e ) { + LoggerFactory::getInstance( 'ElasticSearchTTMServer' )->error( + 'Problem encountered during deletion.', + [ 'exception' => $e ] + ); + + throw new \RuntimeException( "Problem encountered during deletion.\n" . $e ); + } + return; + } + // TODO: This code can be removed in the future as we drop support for + // older versions of the Elastica extension. 
+ $retryAttempts = self::BULK_INDEX_RETRY_ATTEMPTS; + $search = new \Elastica\Search( $this->getClient() ); + $search->setQuery( $query ); + $search->addType( $type ); + $search->addIndex( $type->getIndex() ); + $scroll = new \Elastica\Scroll( $search, '15m' ); + + foreach ( $scroll as $results ) { + $ids = []; + foreach ( $results as $result ) { + $ids[] = $result->getId(); + } + + if ( $ids === [] ) { + continue; + } + + MWElasticUtils::withRetry( $retryAttempts, + function () use ( $ids, $type ) { + $type->deleteIds( $ids ); + } + ); + } + } + + /** + * @return bool + */ + public function isFrozen() { + if ( method_exists( 'MWElasticUtils', 'isFrozen' ) ) { + try { + return MWElasticUtils::isFrozen( $this->getClient() ); + } catch ( \Exception $e ) { + LoggerFactory::getInstance( 'ElasticSearchTTMServer' )->warning( + 'Problem encountered while checking the frozen index.', + [ 'exception' => $e ] + ); + return false; + } + } + + // TODO: This code can be removed in the future as we drop support for + // older versions of the Elastica extension. 
+ if ( !isset( $this->config['frozen_index'] ) ) { + return false; + } + $frozenIndex = $this->config['frozen_index']; + $indices = [ static::ALL_INDEXES_FROZEN_NAME, $this->getIndexName() ]; + $ids = ( new \Elastica\Query\Ids() ) + ->setIds( $indices ); + + try { + $resp = $this->getClient() + ->getIndex( $frozenIndex ) + ->getType( static::FROZEN_TYPE ) + ->search( \Elastica\Query::create( $ids ) ); + + if ( $resp->count() === 0 ) { + return false; + } else { + return true; + } + } catch ( \Exception $e ) { + LoggerFactory::getInstance( 'ElasticSearchTTMServer' )->warning( + 'Problem encountered while checking the frozen index.', + [ 'exception' => $e ] + ); + return false; + } + } +} diff --git a/www/wiki/extensions/Translate/ttmserver/Exceptions.php b/www/wiki/extensions/Translate/ttmserver/Exceptions.php new file mode 100644 index 00000000..97859cf0 --- /dev/null +++ b/www/wiki/extensions/Translate/ttmserver/Exceptions.php @@ -0,0 +1,4 @@ +<?php + +class TTMServerException extends MWException { +} diff --git a/www/wiki/extensions/Translate/ttmserver/FakeTTMServer.php b/www/wiki/extensions/Translate/ttmserver/FakeTTMServer.php new file mode 100644 index 00000000..16715592 --- /dev/null +++ b/www/wiki/extensions/Translate/ttmserver/FakeTTMServer.php @@ -0,0 +1,60 @@ +<?php +/** + * TTMServer - The Translate extension translation memory interface + * + * @file + * @author Niklas Laxström + * @copyright Copyright © 2012-2013, Niklas Laxström + * @license GPL-2.0-or-later + * @ingroup TTMServer + */ + +/** + * NO-OP version of TTMServer when it is disabled. + * Keeps other code simpler when they can just do + * TTMServer::primary()->update( ... 
); + * @since 2012-01-28 + * @ingroup TTMServer + */ +class FakeTTMServer implements ReadableTTMServer, WritableTTMServer { + public function query( $sourceLanguage, $targetLanguage, $text ) { + return []; + } + + public function isLocalSuggestion( array $suggestion ) { + return false; + } + + public function expandLocation( array $suggestion ) { + return ''; + } + + public function update( MessageHandle $handle, $targetText ) { + } + + public function beginBootstrap() { + } + + public function beginBatch() { + } + + public function batchInsertDefinitions( array $batch ) { + } + + public function batchInsertTranslations( array $batch ) { + } + + public function endBatch() { + } + + public function endBootstrap() { + } + + public function getMirrors() { + return []; + } + + public function isFrozen() { + return false; + } +} diff --git a/www/wiki/extensions/Translate/ttmserver/FuzzyLikeThis.php b/www/wiki/extensions/Translate/ttmserver/FuzzyLikeThis.php new file mode 100644 index 00000000..143b3222 --- /dev/null +++ b/www/wiki/extensions/Translate/ttmserver/FuzzyLikeThis.php @@ -0,0 +1,222 @@ +<?php +/** + * NOTE: the following class has been copied from elastica 2.3.1 : + * https://github.com/ruflin/Elastica/blob/2.3.1/lib/Elastica/Query/FuzzyLikeThis.php + * (few modifications have been made to comply with phpcs rules used by this extension) + * It is intended to be used as a temporary workaround with the wmf extra + * elasticsearch plugin with elasticsearch 2.x. 
+ * + * The MIT License (MIT) + * + * Copyright (c) 2014 Nicolas Ruflin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * (c.f. https://github.com/ruflin/Elastica/blob/2.3.1/LICENSE.txt) + * + * @file + * @license MIT + * @ingroup TTMServer + */ + +/** + * Fuzzy Like This query. + * + * @author Raul Martinez, Jr <juneym@gmail.com> + * + * @link https://www.elastic.co/guide/en/elasticsearch/reference/1.7/query-dsl-flt-query.html + * + * @since 2016.05 + * @ingroup TTMServer + */ +class FuzzyLikeThis extends \Elastica\Query\AbstractQuery { + // phpcs:disable PSR2.Classes.PropertyDeclaration.Underscore + /** + * Field names. + * + * @var array Field names + */ + protected $_fields = []; + + /** + * Like text. + * + * @var string Like text + */ + protected $_likeText = ''; + + /** + * Ignore term frequency. + * + * @var bool ignore term frequency + */ + protected $_ignoreTF = false; + + /** + * Max query terms value. 
+ * + * @var int Max query terms value + */ + protected $_maxQueryTerms = 25; + + /** + * fuzziness. + * + * @var int fuzziness + */ + protected $_fuzziness = 2; + + /** + * Prefix Length. + * + * @var int Prefix Length + */ + protected $_prefixLength = 0; + + /** + * Analyzer. + * + * @var string Analyzer + */ + protected $_analyzer; + // phpcs:enable + + /** + * Adds field to flt query. + * + * @param array $fields Field names + * + * @return $this + */ + public function addFields( array $fields ) { + $this->_fields = $fields; + + return $this; + } + + /** + * Set the "like_text" value. + * + * @param string $text + * + * @return $this + */ + public function setLikeText( $text ) { + $text = trim( $text ); + $this->_likeText = $text; + + return $this; + } + + /** + * Set the "ignore_tf" value (ignore term frequency). + * + * @param bool $ignoreTF + * + * @return $this + */ + public function setIgnoreTF( $ignoreTF ) { + $this->_ignoreTF = (bool)$ignoreTF; + + return $this; + } + + /** + * Set the minimum similarity. + * + * @param int $value + * + * @return $this + */ + public function setFuzziness( $value ) { + $value = (int)$value; + $this->_fuzziness = $value; + + return $this; + } + + /** + * Set Prefix Length. + * + * @param int $value Prefix length + * + * @return $this + */ + public function setPrefixLength( $value ) { + $this->_prefixLength = (int)$value; + + return $this; + } + + /** + * Set max_query_terms. + * + * @param int $value Max query terms value + * + * @return $this + */ + public function setMaxQueryTerms( $value ) { + $this->_maxQueryTerms = (int)$value; + + return $this; + } + + /** + * Set analyzer. + * + * @param string $text Analyzer text + * + * @return $this + */ + public function setAnalyzer( $text ) { + $text = trim( $text ); + $this->_analyzer = $text; + + return $this; + } + + /** + * Converts fuzzy like this query to array. 
+ * + * @return array Query array + * + * @see \Elastica\Query\AbstractQuery::toArray() + */ + public function toArray() { + if ( !empty( $this->_fields ) ) { + $args['fields'] = $this->_fields; + } + + if ( !empty( $this->_analyzer ) ) { + $args['analyzer'] = $this->_analyzer; + } + + $args['fuzziness'] = ( $this->_fuzziness > 0 ) ? $this->_fuzziness : 0; + + $args['like_text'] = $this->_likeText; + $args['prefix_length'] = $this->_prefixLength; + $args['ignore_tf'] = $this->_ignoreTF; + $args['max_query_terms'] = $this->_maxQueryTerms; + + $data = parent::toArray(); + $args = array_merge( $args, $data['fuzzy_like_this'] ); + + return [ 'fuzzy_like_this' => $args ]; + } +} diff --git a/www/wiki/extensions/Translate/ttmserver/Interfaces.php b/www/wiki/extensions/Translate/ttmserver/Interfaces.php new file mode 100644 index 00000000..1f8cb20e --- /dev/null +++ b/www/wiki/extensions/Translate/ttmserver/Interfaces.php @@ -0,0 +1,154 @@ +<?php +/** + * TTMServer - The Translate extension translation memory interface + * + * @file + * @author Niklas Laxström + * @copyright Copyright © 2012-2013, Niklas Laxström + * @license GPL-2.0-or-later + * @ingroup TTMServer + */ + +/** + * Interface for TTMServer that can be queried (=all of them). + * @ingroup TTMServer + * @since 2012-06-27 + */ +interface ReadableTTMServer { + /** + * Fetches all relevant suggestions for given text. 
+ * + * @param string $sourceLanguage language code for the provided text + * @param string $targetLanguage language code for the suggestions + * @param string $text the text for which to search suggestions + * @return array List: unordered suggestions, which each has fields: + * - source: String: the original text of the suggestion + * - target: String: the suggestion + * - context: String: title of the page where the suggestion comes from + * - quality: Float: the quality of suggestion, 1 is perfect match + */ + public function query( $sourceLanguage, $targetLanguage, $text ); + + /** + * Determines if the suggestion returned by this TTMServer comes + * from this wiki or any other wiki. + * @param array $suggestion + * @return bool + */ + public function isLocalSuggestion( array $suggestion ); + + /** + * Given suggestion returned by this TTMServer, constructs fully + * qualified URL to the location of the translation. + * @param array $suggestion + * @return string URL + */ + public function expandLocation( array $suggestion ); +} + +/** + * Interface for TTMServer that can be updated. + * @ingroup TTMServer + * @since 2012-06-27 + */ +interface WritableTTMServer { + /** + * Shovels the new translation into translation memory. + * Use this for single updates (=after message edit). + * If no text is provided, entry will be removed from the translation + * memory. + * + * @param MessageHandle $handle + * @param string|null $targetText Use null to only delete. + */ + public function update( MessageHandle $handle, $targetText ); + + /** + * Called when starting to fill the translation memory. + * Set up necessary variables and remove old content + * from the server. + */ + public function beginBootstrap(); + + /** + * Called before every batch (MessageGroup). + */ + public function beginBatch(); + + /** + * Called multiple times per batch if necessary. 
+ * + * @param array $batch + */ + public function batchInsertDefinitions( array $batch ); + + /** + * Called multiple times per batch if necessary. + * + * @param array $batch + */ + public function batchInsertTranslations( array $batch ); + + /** + * Called after every batch (MessageGroup). + */ + public function endBatch(); + + /** + * Do any cleanup, optimizing etc. + */ + public function endBootstrap(); + + /** + * Get the list of services to duplicate writes to make them "mirrors" + * of this service. + * + * @since 2017.04 + * @return string[] + */ + public function getMirrors(); + + /** + * Check if the service is frozen; attempting to write to + * a frozen service may lead to errors or unexpected behaviors. + * + * @since 2017.04 + * @return bool true if the service is frozen + */ + public function isFrozen(); +} + +/** + * Interface for TTMServer that can act as backend for translation search. + * @ingroup TTMServer + * @since 2014.04 + */ +interface SearchableTTMServer { + /** + * Performs a search in the translation database. + * + * @param string $queryString String to search for. + * @param array $opts Query options like language. + * @param array $highlight Tags for highlighting. 
+ * @return mixed Result set + */ + public function search( $queryString, $opts, $highlight ); + + /** + * @param stdClass $resultset + * @return array[] + */ + public function getFacets( $resultset ); + + /** + * @param stdClass $resultset + * @return int + */ + public function getTotalHits( $resultset ); + + /** + * @param stdClass $resultset + * @return array[] + */ + public function getDocuments( $resultset ); +} diff --git a/www/wiki/extensions/Translate/ttmserver/RemoteTTMServer.php b/www/wiki/extensions/Translate/ttmserver/RemoteTTMServer.php new file mode 100644 index 00000000..402ad5ac --- /dev/null +++ b/www/wiki/extensions/Translate/ttmserver/RemoteTTMServer.php @@ -0,0 +1,32 @@ +<?php +/** + * TTMServer - The Translate extension translation memory interface + * + * @file + * @author Niklas Laxström + * @copyright Copyright © 2012-2013, Niklas Laxström + * @license GPL-2.0-or-later + * @ingroup TTMServer + */ + +/** + * Class for handling remote TTMServers over MediaWiki API. + * Currently querying is done in TranslationHelpers, and + * this class only handles location retrieval. + * @since 2012-06-27 + * @ingroup TTMServer + */ +class RemoteTTMServer extends TTMServer implements ReadableTTMServer { + public function query( $sourceLanguage, $targetLanguage, $text ) { + // @todo Implement some day perhaps? 
+ return []; + } + + public function isLocalSuggestion( array $suggestion ) { + return false; + } + + public function expandLocation( array $suggestion ) { + return $suggestion['location']; + } +} diff --git a/www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php b/www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php new file mode 100644 index 00000000..375d544f --- /dev/null +++ b/www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php @@ -0,0 +1,446 @@ +<?php +/** + * TTMServer - The Translate extension translation memory interface + * + * @file + * @author Niklas Laxström + * @copyright Copyright © 2012-2013, Niklas Laxström + * @license GPL-2.0-or-later + * @ingroup TTMServer + */ + +/** + * TTMServer backed based on Solr instance. Depends on Solarium. + * @since 2012-06-27 + * @ingroup TTMServer + * @deprecated 1.27. Will be removed in 1.29. + */ +class SolrTTMServer + extends TTMServer + implements ReadableTTMServer, SearchableTTMServer, WritableTTMServer +{ + /** + * In case auto-commit is not enabled, or even if it is, tell solr to + * commit before this time has passed, in milliseconds. + */ + const COMMIT_WITHIN = 5000; + + protected $client; + + /** + * Reference to the maintenance script to relay logging output. 
+ */ + protected $logger; + + public function __construct( $config ) { + wfDeprecated( __METHOD__, '1.24' ); + + parent::__construct( $config ); + + if ( isset( $config['config'] ) ) { + $this->client = new Solarium_Client( $config['config'] ); + } else { + $this->client = new Solarium_Client(); + } + } + + public function isLocalSuggestion( array $suggestion ) { + return $suggestion['wiki'] === wfWikiID(); + } + + public function expandLocation( array $suggestion ) { + return $suggestion['uri']; + } + + public function query( $sourceLanguage, $targetLanguage, $text ) { + try { + return $this->doQuery( $sourceLanguage, $targetLanguage, $text ); + } catch ( Solarium_Exception $e ) { + throw new TranslationHelperException( 'Solarium exception: ' . $e ); + } + } + + /// @see ReadableTTMServer::query + protected function doQuery( $sourceLanguage, $targetLanguage, $text ) { + /* Two query system: + * 1) Find all strings in source language that match text + * 2) Do another query for translations for those strings + */ + // For now impose a length limit on query string to avoid doing + // very slow queries. Magic number. + if ( strlen( $text ) > 789 ) { + return []; + } + + $query = $this->client->createSelect(); + $query->setFields( [ 'globalid', 'content', 'score' ] ); + + /* The interface usually displays three best candidates. These might + * come from more than three matches, if the translation is the same. + * This might not find all suggestions, if the top N best matching + * source texts don't have translations, but worse matches do. We + * could loop with start parameter to fetch more until we have enough + * suggestions or the quality drops below the cutoff point. */ + $query->setRows( 25 ); + + /* Our string can contain all kind of nasty characters, so we need + * escape them with great pain. 
*/ + $helper = $query->getHelper(); + $dist = $helper->escapePhrase( $text ); + // "edit" could also be ngram or other algorithm + $dist = "strdist($dist,content,edit)"; + /* Note how we need to escape twice here, first the string for strdist + * and then the strdist call itself for the query. And of course every- + * thing will be URL encoded once sent over the line. */ + $query->setQuery( '_val_:%P1%', [ $dist ] ); + + /* Filter queries are supposed to be efficient as they are separately + * cached, but I haven't done any benchmarks. */ + $query->createFilterQuery( 'lang' ) + ->setQuery( 'language:%P1%', [ $sourceLanguage ] ); + + $resultset = $this->client->select( $query ); + + /* This query is doing two unrelated things: + * 1) Collect the message contents and scores so that they can + * be accessed later for the translations we found. + * 2) Build the query string for the query that fetches the + * translations. + * This code is a bit uglier than I'd like it to be, since + * there is no field that globally identifies a message (message + * definition and translations). */ + $contents = $scores = []; + $queryString = ''; + foreach ( $resultset as $doc ) { + $sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid ); + $contents[$sourceId] = $doc->content; + $scores[$sourceId] = $doc->score; + + $globalid = $helper->escapePhrase( "$sourceId/$targetLanguage" ); + $queryString .= "globalid:$globalid "; + } + + // Second query to fetch available translations + $fetchQuery = $this->client->createSelect(); + $fetchQuery->setFields( [ 'wiki', 'uri', 'content', 'messageid', 'globalid' ] ); + // These come in random order, so have to fetch all and sort + $fetchQuery->setRows( 25 ); + $fetchQuery->setQuery( $queryString ); + // With AND we would not find anything, obviously. 
+ $fetchQuery->setQueryDefaultOperator( Solarium_Query_Select::QUERY_OPERATOR_OR ); + + $translations = $this->client->select( $fetchQuery ); + + $suggestions = []; + foreach ( $translations as $doc ) { + /* Construct the matching source id */ + $sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid ); + + /* Unfortunately we cannot do this on the search server, + * because score is not a real field and thus cannot be + * used in a filter query. */ + $quality = $scores[$sourceId]; + if ( $quality < $this->config['cutoff'] ) { + continue; + } + + $suggestions[] = [ + 'source' => $contents[$sourceId], + 'target' => $doc->content, + 'context' => $doc->messageid, + 'quality' => $quality, + 'wiki' => $doc->wiki, + 'location' => $doc->messageid . '/' . $targetLanguage, + 'uri' => $doc->uri, + ]; + } + + /* Like mentioned above, we get results in random order. Sort them + * now to have best matches first as expected by callers. */ + uasort( $suggestions, function ( $a, $b ) { + if ( $a['quality'] === $b['quality'] ) { + return 0; + } + + return ( $a['quality'] < $b['quality'] ) ? 1 : -1; + } ); + + return $suggestions; + } + + /* Write functions */ + + public function update( MessageHandle $handle, $targetText ) { + if ( $handle->getCode() === '' ) { + return false; + } + + /* There are various different cases here: + * [new or updated] [fuzzy|non-fuzzy] [translation|definition] + * 1) We don't distinguish between new or updated here. + * 2) Delete old translation, but not definition + * 3) Insert new translation or definition, if non-fuzzy + * The definition should never be fuzzied anyway. + * + * These only apply to known messages. 
+ */ + + $update = $this->client->createUpdate(); + $title = $handle->getTitle(); + + $doDelete = true; + $sourceLanguage = ''; + if ( $handle->isValid() ) { + $sourceLanguage = $handle->getGroup()->getSourceLanguage(); + if ( $handle->getCode() === $sourceLanguage ) { + $doDelete = false; + } + } + + if ( $doDelete ) { + $base = Title::makeTitle( $title->getNamespace(), $handle->getKey() ); + $conds = [ + 'wiki' => wfWikiID(), + 'language' => $handle->getCode(), + 'messageid' => $base->getPrefixedText(), + ]; + foreach ( $conds as $key => &$value ) { + $value = "$key:" . $update->getHelper()->escapePhrase( $value ); + } + $update->addDeleteQuery( implode( ' AND ', $conds ) ); + } + + if ( $targetText !== null ) { + if ( $handle->isValid() ) { + // Of the message definition page + $targetTitle = $handle->getTitle(); + $sourceTitle = Title::makeTitle( + $targetTitle->getNamespace(), + $handle->getKey() . '/' . $sourceLanguage + ); + $revId = (int)$sourceTitle->getLatestRevID(); + /* Note: in some cases the source page might not exist, in this case + * we use 0 as message version identifier, to differentiate them from + * orphan messages */ + } else { + $revId = 'orphan'; + } + + $doc = $this->createDocument( $handle, $targetText, $revId ); + // Add document and commit within X seconds. 
+ $update->addDocument( $doc, null, self::COMMIT_WITHIN ); + } + + try { + $this->client->update( $update ); + } catch ( Solarium_Exception $e ) { + error_log( 'SolrTTMServer update-write failed' ); + + return false; + } + + return true; + } + + /** + * @see schema.xml + * @param MessageHandle $handle + * @param string $text + * @param int $revId + * @return Solarium_Document_ReadWrite + */ + protected function createDocument( MessageHandle $handle, $text, $revId ) { + $language = $handle->getCode(); + $translationTitle = $handle->getTitle(); + + $title = Title::makeTitle( $handle->getTitle()->getNamespace(), $handle->getKey() ); + $wiki = wfWikiID(); + $messageid = $title->getPrefixedText(); + $globalid = "$wiki-$messageid-$revId/$language"; + + $doc = new Solarium_Document_ReadWrite(); + $doc->wiki = $wiki; + $doc->uri = $translationTitle->getCanonicalURL(); + $doc->messageid = $messageid; + $doc->globalid = $globalid; + + $doc->language = $language; + $doc->content = $text; + $doc->setField( 'group', $handle->getGroupIds() ); + + return $doc; + } + + public function beginBootstrap() { + $update = $this->client->createUpdate(); + $query = 'wiki:' . 
$update->getHelper()->escapePhrase( wfWikiID() ); + $update->addDeleteQuery( $query ); + $update->addCommit(); + $this->client->update( $update ); + } + + public function beginBatch() { + // I hate the rule that forbids {} + } + + public function batchInsertDefinitions( array $batch ) { + $lb = new LinkBatch(); + foreach ( $batch as $data ) { + $lb->addObj( $data[0]->getTitle() ); + } + $lb->execute(); + + $this->batchInsertTranslations( $batch ); + } + + public function batchInsertTranslations( array $batch ) { + $update = $this->client->createUpdate(); + foreach ( $batch as $key => $data ) { + list( $handle, $sourceLanguage, $text ) = $data; + $revId = $handle->getTitleForLanguage( $sourceLanguage )->getLatestRevID(); + $doc = $this->createDocument( $handle, $text, $revId ); + // Add document and commit within X seconds. + $update->addDocument( $doc, null, self::COMMIT_WITHIN ); + } + + $retries = 5; + + while ( $retries-- > 0 ) { + try { + $this->client->update( $update ); + break; + } catch ( Solarium_Client_HttpException $e ) { + if ( $retries === 0 ) { + throw $e; + } else { + $c = get_class( $e ); + $msg = $e->getMessage(); + $this->logOutput( "Batch failed ($c: $msg), trying again in 10 seconds" ); + sleep( 10 ); + } + } + } + } + + public function endBatch() { + $update = $this->client->createUpdate(); + $this->client->update( $update ); + } + + public function endBootstrap() { + $update = $this->client->createUpdate(); + $update->addCommit(); + $update->addOptimize(); + $this->client->update( $update ); + } + + public function getSolarium() { + return $this->client; + } + + public function setLogger( $logger ) { + $this->logger = $logger; + } + + // Can it get any uglier? 
+ protected function logOutput( $text ) { + if ( $this->logger ) { + $this->logger->statusLine( "$text\n" ); + } + } + + /** + * Search interface + * @param string $queryString + * @param array $opts + * @param array $highlight + * @return array + * @throws TTMServerException + */ + public function search( $queryString, $opts, $highlight ) { + $client = $this->getSolarium(); + + $query = $client->createSelect(); + $dismax = $query->getDisMax(); + $dismax->setQueryParser( 'edismax' ); + $query->setQuery( $queryString ); + $query->setRows( $opts['limit'] ); + $query->setStart( $opts['offset'] ); + + list( $pre, $post ) = $highlight; + $hl = $query->getHighlighting(); + $hl->setFields( 'text' ); + $hl->setSimplePrefix( $pre ); + $hl->setSimplePostfix( $post ); + $hl->setMaxAnalyzedChars( '5000' ); + $hl->setFragSize( '5000' ); + $hl->setSnippets( 1 ); + + $languageFilter = $opts['language']; + if ( $languageFilter !== '' ) { + $query->createFilterQuery( 'languageFilter' ) + ->setQuery( 'language:%P1%', [ $languageFilter ] ) + ->addTag( 'filter' ); + } + + $groupFilter = $opts['group']; + if ( $groupFilter !== '' ) { + $query->createFilterQuery( 'groupFilter' ) + ->setQuery( 'group:%P1%', [ $groupFilter ] ) + ->addTag( 'filter' ); + } + + $facetSet = $query->getFacetSet(); + + $language = $facetSet->createFacetField( 'language' ); + $language->setField( 'language' ); + $language->setMinCount( 1 ); + $language->addExclude( 'filter' ); + + $group = $facetSet->createFacetField( 'group' ); + $group->setField( 'group' ); + $group->setMinCount( 1 ); + $group->setMissing( true ); + $group->addExclude( 'filter' ); + + try { + return $client->select( $query ); + } catch ( Solarium_Client_HttpException $e ) { + throw new TTMServerException( $e->getMessage() ); + } + } + + public function getFacets( $resultset ) { + return [ + 'language' => iterator_to_array( $resultset->getFacetSet()->getFacet( 'language' ) ), + 'group' => iterator_to_array( $resultset->getFacetSet()->getFacet( 
'group' ) ), + ]; + } + + public function getTotalHits( $resultset ) { + return $resultset->getNumFound(); + } + + public function getDocuments( $resultset ) { + $highlighting = $resultset->getHighlighting(); + $ret = []; + foreach ( $resultset as $document ) { + $fields = iterator_to_array( $document ); + // Compatibility mapping + $fields['localid'] = $fields['messageid']; + + $hdoc = $highlighting->getResult( $document->globalid ); + $text = $hdoc->getField( 'text' ); + if ( $text === [] ) { + $text = $document->text; + } else { + $text = $text[0]; + } + + $fields['content'] = $text; + $ret[] = $fields; + } + + return $ret; + } +} diff --git a/www/wiki/extensions/Translate/ttmserver/TTMServer.php b/www/wiki/extensions/Translate/ttmserver/TTMServer.php new file mode 100644 index 00000000..2a7f0900 --- /dev/null +++ b/www/wiki/extensions/Translate/ttmserver/TTMServer.php @@ -0,0 +1,207 @@ +<?php +/** + * TTMServer - The Translate extension translation memory interface + * + * @file + * @author Niklas Laxström + * @license GPL-2.0-or-later + * @defgroup TTMServer The Translate extension translation memory interface + */ + +/** + * Some general static methods for instantiating TTMServer and helpers. + * @since 2012-01-28 + * Rewritten in 2012-06-27. 
+ * @ingroup TTMServer + */ +class TTMServer { + /** @var array */ + protected $config; + + /** + * @param array $config + */ + protected function __construct( array $config ) { + $this->config = $config; + } + + /** + * @param array $config + * @return TTMServer|null + * @throws MWException + */ + public static function factory( array $config ) { + if ( isset( $config['class'] ) ) { + $class = $config['class']; + + return new $class( $config ); + } elseif ( isset( $config['type'] ) ) { + $type = $config['type']; + switch ( $type ) { + case 'ttmserver': + return new DatabaseTTMServer( $config ); + case 'remote-ttmserver': + return new RemoteTTMServer( $config ); + default: + return null; + } + } + + throw new MWException( 'TTMServer with no type' ); + } + + /** + * Returns the primary server instance, useful for chaining. + * Primary instance is defined by $wgTranslateTranslationDefaultService + * which is a key to $wgTranslateTranslationServices. + * @return WritableTTMServer + */ + public static function primary() { + global $wgTranslateTranslationServices, + $wgTranslateTranslationDefaultService; + if ( isset( $wgTranslateTranslationServices[$wgTranslateTranslationDefaultService] ) ) { + $obj = self::factory( $wgTranslateTranslationServices[$wgTranslateTranslationDefaultService] ); + if ( $obj instanceof WritableTTMServer ) { + return $obj; + } + } + + return new FakeTTMServer(); + } + + /** + * @param array[] $suggestions + * @return array[] + */ + public static function sortSuggestions( array $suggestions ) { + usort( $suggestions, [ __CLASS__, 'qualitySort' ] ); + + return $suggestions; + } + + /** + * @param array $a + * @param array $b + * @return int + */ + protected static function qualitySort( $a, $b ) { + list( $c, $d ) = [ $a['quality'], $b['quality'] ]; + if ( $c === $d ) { + return 0; + } + + // Descending sort + return ( $c > $d ) ? -1 : 1; + } + + /** + * PHP implementation of Levenshtein edit distance algorithm. 
+ * Uses the native PHP implementation when possible for speed. + * The native levenshtein is limited to 255 bytes. + * + * @param string $str1 + * @param string $str2 + * @param int $length1 + * @param int $length2 + * @return int + */ + public static function levenshtein( $str1, $str2, $length1, $length2 ) { + if ( $length1 === 0 ) { + return $length2; + } + if ( $length2 === 0 ) { + return $length1; + } + if ( $str1 === $str2 ) { + return 0; + } + + $bytelength1 = strlen( $str1 ); + $bytelength2 = strlen( $str2 ); + if ( $bytelength1 === $length1 && $bytelength1 <= 255 + && $bytelength2 === $length2 && $bytelength2 <= 255 + ) { + return levenshtein( $str1, $str2 ); + } + + $prevRow = range( 0, $length2 ); + for ( $i = 0; $i < $length1; $i++ ) { + $currentRow = []; + $currentRow[0] = $i + 1; + $c1 = mb_substr( $str1, $i, 1 ); + for ( $j = 0; $j < $length2; $j++ ) { + $c2 = mb_substr( $str2, $j, 1 ); + $insertions = $prevRow[$j + 1] + 1; + $deletions = $currentRow[$j] + 1; + $substitutions = $prevRow[$j] + ( ( $c1 !== $c2 ) ? 
1 : 0 ); + $currentRow[] = min( $insertions, $deletions, $substitutions ); + } + $prevRow = $currentRow; + } + + return $prevRow[$length2]; + } + + /** + * Hook: ArticleDeleteComplete + * @param WikiPage $wikipage + */ + public static function onDelete( WikiPage $wikipage ) { + $handle = new MessageHandle( $wikipage->getTitle() ); + $job = TTMServerMessageUpdateJob::newJob( $handle, 'delete' ); + JobQueueGroup::singleton()->push( $job ); + } + + /** + * Called from TranslateEditAddons::onSave + * @param MessageHandle $handle + */ + public static function onChange( MessageHandle $handle ) { + $job = TTMServerMessageUpdateJob::newJob( $handle, 'refresh' ); + JobQueueGroup::singleton()->push( $job ); + } + + /** + * @param MessageHandle $handle + * @param array $old + */ + public static function onGroupChange( MessageHandle $handle, $old ) { + if ( $old === [] ) { + // Don't bother for newly added messages + return; + } + + $job = TTMServerMessageUpdateJob::newJob( $handle, 'rebuild' ); + JobQueueGroup::singleton()->push( $job ); + } + + /** + * @return string[] + */ + public function getMirrors() { + global $wgTranslateTranslationServices; + if ( isset( $this->config['mirrors'] ) ) { + $mirrors = []; + foreach ( $this->config['mirrors'] as $name ) { + if ( !is_string( $name ) ) { + throw new TTMServerException( "Invalid configuration set in " . + "mirrors, expected an array of strings" ); + } + if ( !isset( $wgTranslateTranslationServices[$name] ) ) { + throw new TTMServerException( "Invalid configuration in " . 
+ "mirrors, unknown service $name" ); + } + $mirrors[$name] = true; + } + return array_keys( $mirrors ); + } + return []; + } + + /** + * @return bool + */ + public function isFrozen() { + return false; + } +} diff --git a/www/wiki/extensions/Translate/ttmserver/TTMServerMessageUpdateJob.php b/www/wiki/extensions/Translate/ttmserver/TTMServerMessageUpdateJob.php new file mode 100644 index 00000000..7a6a91d7 --- /dev/null +++ b/www/wiki/extensions/Translate/ttmserver/TTMServerMessageUpdateJob.php @@ -0,0 +1,347 @@ +<?php +/** + * Contains class with job for updating translation memory. + * + * @file + * @author Niklas Laxström + * @license GPL-2.0-or-later + */ + +use MediaWiki\Logger\LoggerFactory; + +/** + * Job for updating translation memory. + * + * job params: + * - command: the command to run, defaults to 'rebuild' + * - service: the service to write to, if set to null the job will write + * to the default (primary) service and its replicas. + * - errorCount: number of errors encountered while trying to perform the write + * on this service + * + * This job handles retries itself and return false in allowRetries to disable + * JobQueue's internal retry service. + * + * If mirroring is activated on the primary service then the first job + * will try to write to all services, it will resend a new job to + * every single service that failed and will increment errorCount. + * When too many errors occur on single service the job is dropped. + * + * @ingroup JobQueue + */ +class TTMServerMessageUpdateJob extends Job { + /** + * Number of *retries* allowed, 4 means we attempt + * to run the job 5 times (1 initial attempt + 4 retries). + */ + const MAX_ERROR_RETRY = 4; + + /** + * Constant used by backoffDelay(). + * With 7 the cumulative delay between the first and last attempt is + * between 8 and 33 minutes. + */ + const WRITE_BACKOFF_EXPONENT = 7; + + /** + * The maximum amount of time jobs delayed due to frozen services can remain + * in the job queue. 
+ */ + const DROP_DELAYED_JOBS_AFTER = 86400; // 60 * 60 * 24 * 1; + + /** + * @param MessageHandle $handle + * @param string $command + * @return self + */ + public static function newJob( MessageHandle $handle, $command ) { + $job = new self( $handle->getTitle(), [ 'command' => $command ] ); + + return $job; + } + + /** + * @param Title $title + * @param array $params + */ + public function __construct( $title, $params = [] ) { + parent::__construct( + __CLASS__, + $title, + $params + [ + 'command' => 'rebuild', + 'service' => null, + 'errorCount' => 0, + 'createdAt' => time(), + 'retryCount' => 0, + ] + ); + } + + /** + * Fetch all the translations and update them. + * @return bool + */ + public function run() { + global $wgTranslateTranslationServices, + $wgTranslateTranslationDefaultService; + + $service = $this->params['service']; + $writeToMirrors = false; + + if ( $service === null ) { + $service = $wgTranslateTranslationDefaultService; + $writeToMirrors = true; + } + + if ( !isset( $wgTranslateTranslationServices[$service] ) ) { + LoggerFactory::getInstance( 'TTMServerUpdates' )->warning( + 'Received update job for a an unknown service {service}.', + [ 'service' => $service ] + ); + return true; + } + + $services = [ $service ]; + if ( $writeToMirrors ) { + $config = $wgTranslateTranslationServices[$service]; + $server = TTMServer::factory( $config ); + $services = array_unique( + array_merge( $services, $server->getMirrors() ) + ); + } + + foreach ( $services as $service ) { + $this->runCommandWithRetry( $service ); + } + return true; + } + + /** + * @inheritDoc + */ + public function allowRetries() { + return false; + } + + /** + * Run the update on the specified service name. 
+ * + * @param string $serviceName the service name + */ + private function runCommandWithRetry( $serviceName ) { + global $wgTranslateTranslationServices; + + if ( !isset( $wgTranslateTranslationServices[$serviceName] ) ) { + LoggerFactory::getInstance( 'TTMServerUpdates' )->warning( + 'Cannot write to {service}: service is unknown.', + [ 'service' => $serviceName ] + ); + return; + } + $ttmserver = TTMServer::factory( $wgTranslateTranslationServices[$serviceName] ); + + if ( $serviceName === null || !( $ttmserver instanceof WritableTTMServer ) ) { + LoggerFactory::getInstance( 'TTMServerUpdates' )->warning( + 'Received update job for a service that does not implement ' . + 'WritableTTMServer, please check config for {service}.', + [ 'service' => $serviceName ] + ); + return; + } + + try { + if ( $ttmserver->isFrozen() ) { + $this->requeueRetry( $serviceName ); + } else { + $this->runCommand( $ttmserver ); + } + } catch ( \Exception $e ) { + $this->requeueError( $serviceName, $e ); + } + } + + /** + * @param string $serviceName the service in error + * @param Exception $e the error + */ + private function requeueError( $serviceName, $e ) { + LoggerFactory::getInstance( 'TTMServerUpdates' )->warning( + 'Exception thrown while running {command} on ' . + 'service {service}: {errorMessage}', + [ + 'command' => $this->params['command'], + 'service' => $serviceName, + 'errorMessage' => $e->getMessage(), + 'exception' => $e, + ] + ); + if ( $this->params['errorCount'] >= self::MAX_ERROR_RETRY ) { + LoggerFactory::getInstance( 'TTMServerUpdates' )->warning( + 'Dropping failing job {command} for service {service} ' . 
+ 'after repeated failure', + [ + 'command' => $this->params['command'], + 'service' => $serviceName, + ] + ); + return; + } + + $delay = self::backoffDelay( $this->params['errorCount'] ); + $job = clone $this; + $job->params['errorCount']++; + $job->params['service'] = $serviceName; + $job->setDelay( $delay ); + LoggerFactory::getInstance( 'TTMServerUpdates' )->info( + 'Update job reported failure on service {service}. ' . + 'Requeueing job with delay of {delay}.', + [ + 'service' => $serviceName, + 'delay' => $delay + ] + ); + $this->resend( $job ); + } + + /** + * Re-queue job that is frozen, or drop the job if it has + * been frozen for too long. + * + * @param string $serviceName + */ + private function requeueRetry( $serviceName ) { + $diff = time() - $this->params['createdAt']; + $dropTimeout = self::DROP_DELAYED_JOBS_AFTER; + if ( $diff > $dropTimeout ) { + LoggerFactory::getInstance( 'TTMServerUpdates' )->warning( + 'Dropping delayed job {command} for service {service} ' . + 'after waiting {diff}s', + [ + 'command' => $this->params['command'], + 'service' => $serviceName, + 'diff' => $diff, + ] + ); + } else { + $delay = self::backoffDelay( $this->params['retryCount'] ); + $job = clone $this; + $job->params['retryCount']++; + $job->params['service'] = $serviceName; + $job->setDelay( $delay ); + LoggerFactory::getInstance( 'TTMServerUpdates' )->debug( + 'Service {service} reported frozen. ' . 
+ 'Requeueing job with delay of {delay}s', + [ + 'service' => $serviceName, + 'delay' => $delay + ] + ); + $this->resend( $job ); + } + } + + /** + * Extracted for testing purpose + * @param self $job + */ + protected function resend( self $job ) { + JobQueueGroup::singleton()->push( $job ); + } + + private function runCommand( WritableTTMServer $ttmserver ) { + $handle = $this->getHandle(); + $command = $this->params['command']; + + if ( $command === 'delete' ) { + $this->updateItem( $ttmserver, $handle, null, false ); + } elseif ( $command === 'rebuild' ) { + $this->updateMessage( $ttmserver, $handle ); + } elseif ( $command === 'refresh' ) { + $this->updateTranslation( $ttmserver, $handle ); + } + } + + /** + * Extracted for testing purpose + * + * @return MessageHandle + */ + protected function getHandle() { + return new MessageHandle( $this->title ); + } + + /** + * Extracted for testing purpose + * + * @param MessageHandle $handle + * @return string + */ + protected function getTranslation( MessageHandle $handle ) { + return TranslateUtils::getMessageContent( + $handle->getKey(), + $handle->getCode(), + $handle->getTitle()->getNamespace() + ); + } + + private function updateMessage( WritableTTMServer $ttmserver, MessageHandle $handle ) { + // Base page update, e.g. group change. Update everything. 
+ $translations = ApiQueryMessageTranslations::getTranslations( $handle ); + foreach ( $translations as $page => $data ) { + $tTitle = Title::makeTitle( $this->title->getNamespace(), $page ); + $tHandle = new MessageHandle( $tTitle ); + $this->updateItem( $ttmserver, $tHandle, $data[0], $tHandle->isFuzzy() ); + } + } + + private function updateTranslation( WritableTTMServer $ttmserver, MessageHandle $handle ) { + // Update only this translation + $translation = $this->getTranslation( $handle ); + $this->updateItem( $ttmserver, $handle, $translation, $handle->isFuzzy() ); + } + + private function updateItem( WritableTTMServer $ttmserver, MessageHandle $handle, $text, $fuzzy ) { + if ( $fuzzy ) { + $text = null; + } + $ttmserver->update( $handle, $text ); + } + + /** + * Set a delay for this job. Note that this might not be possible, the JobQueue + * implementation handling this job doesn't support it (JobQueueDB) but is possible + * for the high performance JobQueueRedis. Note also that delays are minimums - + * at least JobQueueRedis makes no effort to remove the delay as soon as possible + * after it has expired. By default it only checks every five minutes or so. + * Note yet again that if another delay has been set that is longer then this one + * then the _longer_ delay stays. + * + * @param int $delay seconds to delay this job if possible + */ + public function setDelay( $delay ) { + $jobQueue = JobQueueGroup::singleton()->get( $this->getType() ); + if ( !$delay || !$jobQueue->delayedJobsEnabled() ) { + return; + } + $oldTime = $this->getReleaseTimestamp(); + $newTime = time() + $delay; + if ( $oldTime !== null && $oldTime >= $newTime ) { + return; + } + $this->params[ 'jobReleaseTimestamp' ] = $newTime; + } + + /** + * @param int $retryCount The number of times the job has errored out. + * @return int Number of seconds to delay. 
With the default minimum exponent + * of 6 the possible return values are 64, 128, 256, 512 and 1024 giving a + * maximum delay of 17 minutes. + */ + public static function backoffDelay( $retryCount ) { + return ceil( pow( + 2, + static::WRITE_BACKOFF_EXPONENT + rand( 0, min( $retryCount, 4 ) ) + ) ); + } +} diff --git a/www/wiki/extensions/Translate/ttmserver/schema.xml b/www/wiki/extensions/Translate/ttmserver/schema.xml new file mode 100644 index 00000000..0ed2f047 --- /dev/null +++ b/www/wiki/extensions/Translate/ttmserver/schema.xml @@ -0,0 +1,45 @@ +<?xml version="1.0" encoding="UTF-8" ?> +<!-- This is schema file for TTMServer using Solr as backend --> +<schema name="ttmserver" version="1.5"> + <types> + <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/> + <fieldType name="string" class="solr.StrField" sortMissingLast="true" /> + <fieldType name="tint" class="solr.TrieIntField" precisionStep="50" positionIncrementGap="0"/> + <!-- Our input can basically be in any language, so we use either + language agnostic processing or something that can adapt to + the language in question. --> + <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <!-- Consider using solr.ICUTokenizerFactory --> + <tokenizer class="solr.StandardTokenizerFactory"/> + <!-- Consider using solr.ICUNormalizer2FilterFactory --> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + </fieldType> + </types> + + <fields> + <field name="_version_" type="long" indexed="true" stored="true" /> + + <!-- If multiple wikis are using the same server, this will tell which one + owns this document. Maps to MediaWiki wfWikiId(). --> + <field name="wiki" type="string" indexed="true" stored="true" required="true" /> + <!-- Title::getPrefixedText() of the message definition page. 
--> + <field name="messageid" type="string" indexed="true" stored="true" required="true" /> + <!-- Consists of concatenation of wiki and messageid. --> + <field name="globalid" type="string" indexed="true" stored="true" required="true" /> + <!-- URL or something to the translation in the wiki. --> + <field name="uri" type="string" indexed="true" stored="true" required="true" /> + + <!-- FACETs: Language and groups. --> + <field name="language" type="string" indexed="true" stored="true" required="true" /> + <field name="group" multiValued="true" indexed="true" stored="true" type="string" /> + + <field name="content" type="string" indexed="true" stored="true" required="true" /> + + <field name="text" type="text_ws" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" /> + <copyField source="content" dest="text"/> + </fields> + <defaultSearchField>text</defaultSearchField> + <uniqueKey>globalid</uniqueKey> +</schema> |