summaryrefslogtreecommitdiff
path: root/www/wiki/extensions/Translate/ttmserver
diff options
context:
space:
mode:
authorYaco <franco@reevo.org>2020-06-04 11:01:00 -0300
committerYaco <franco@reevo.org>2020-06-04 11:01:00 -0300
commitfc7369835258467bf97eb64f184b93691f9a9fd5 (patch)
treedaabd60089d2dd76d9f5fb416b005fbe159c799d /www/wiki/extensions/Translate/ttmserver
first commit
Diffstat (limited to 'www/wiki/extensions/Translate/ttmserver')
-rw-r--r--www/wiki/extensions/Translate/ttmserver/CrossLanguageTranslationSearchQuery.php154
-rw-r--r--www/wiki/extensions/Translate/ttmserver/DatabaseTTMServer.php282
-rw-r--r--www/wiki/extensions/Translate/ttmserver/ElasticSearchTTMServer.php890
-rw-r--r--www/wiki/extensions/Translate/ttmserver/Exceptions.php4
-rw-r--r--www/wiki/extensions/Translate/ttmserver/FakeTTMServer.php60
-rw-r--r--www/wiki/extensions/Translate/ttmserver/FuzzyLikeThis.php222
-rw-r--r--www/wiki/extensions/Translate/ttmserver/Interfaces.php154
-rw-r--r--www/wiki/extensions/Translate/ttmserver/RemoteTTMServer.php32
-rw-r--r--www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php446
-rw-r--r--www/wiki/extensions/Translate/ttmserver/TTMServer.php207
-rw-r--r--www/wiki/extensions/Translate/ttmserver/TTMServerMessageUpdateJob.php347
-rw-r--r--www/wiki/extensions/Translate/ttmserver/schema.xml45
12 files changed, 2843 insertions, 0 deletions
diff --git a/www/wiki/extensions/Translate/ttmserver/CrossLanguageTranslationSearchQuery.php b/www/wiki/extensions/Translate/ttmserver/CrossLanguageTranslationSearchQuery.php
new file mode 100644
index 00000000..4b047918
--- /dev/null
+++ b/www/wiki/extensions/Translate/ttmserver/CrossLanguageTranslationSearchQuery.php
@@ -0,0 +1,154 @@
+<?php
+/**
+ * Cross Language Translation Search.
+ *
+ * Searches a TTM server in the source language and then filters the hits
+ * locally against the state of the messages in the target language.
+ * @since 2015.08
+ */
+class CrossLanguageTranslationSearchQuery {
+	/** @var SearchableTTMServer */
+	protected $server;
+
+	/** @var array */
+	protected $params;
+
+	/** @var ResultSet|null First scroll response, or null if none was received */
+	protected $resultset;
+
+	/** @var int */
+	protected $total = 0;
+
+	/** @var string[] Passed as the $highlight argument of the server's createSearch() */
+	protected $hl = [ '', '' ];
+
+	/**
+	 * @param array $params Search parameters. This class reads the keys 'query',
+	 *  'offset', 'limit', 'language', 'sourcelanguage' and 'filter'.
+	 * @param SearchableTTMServer $server
+	 */
+	public function __construct( array $params, SearchableTTMServer $server ) {
+		$this->params = $params;
+		$this->server = $server;
+	}
+
+	/**
+	 * Run the search and return the requested page of locally filtered results.
+	 *
+	 * Also records the first scroll response and its total hit count, which
+	 * are exposed through getResultSet() and getTotalHits().
+	 *
+	 * @return array[] Each item has the keys 'content', 'localid' and 'language'.
+	 */
+	public function getDocuments() {
+		$documents = [];
+		$offset = $this->params['offset'];
+		$limit = $this->params['limit'];
+
+		$options = $this->params;
+		$options['language'] = $this->params['sourcelanguage'];
+		// Use a bigger limit than what was requested, since we are likely to throw away many
+		// results in the local filtering step at extractMessages
+		$options['limit'] = $limit * 10;
+		// TODO: the real offset should be communicated to the frontend. It currently assumes
+		// next offset is current offset + limit and previous one is current offset - limit.
+		// It might be difficult to fix scrolling results backwards. For now we handle offset
+		// locally.
+		$options['offset'] = 0;
+
+		$search = $this->server->createSearch( $this->params['query'], $options, $this->hl );
+		$scroll = $search->scroll( '5s' );
+
+		// Used for aggregations. Only the first scroll response has them.
+		$this->resultset = null;
+
+		foreach ( $scroll as $resultSet ) {
+			if ( !$this->resultset ) {
+				$this->resultset = $resultSet;
+				$this->total = $resultSet->getTotalHits();
+			}
+
+			$results = $this->extractMessages( $resultSet->getDocuments() );
+			$documents = array_merge( $documents, $results );
+
+			// Stop scrolling as soon as the requested page can be filled.
+			if ( count( $documents ) >= $offset + $limit ) {
+				break;
+			}
+		}
+
+		// clear was introduced in Elastica 5.3.1, but Elastica extension uses 5.3.0
+		if ( is_callable( [ $scroll, 'clear' ] ) ) {
+			$scroll->clear();
+		}
+
+		// The offset is applied here, not on the server (see the TODO above).
+		return array_slice( $documents, $offset, $limit );
+	}
+
+	/**
+	 * Extract messages from the documents and build message definitions.
+	 * Create a message collection from the definitions in the target language.
+	 * Filter the message collection according to the 'filter' parameter.
+	 * Slicing by limit and offset is done by the caller, getDocuments().
+	 * @param \Elastica\Document[] $documents
+	 * @return array[]
+	 */
+	protected function extractMessages( $documents ) {
+		$messages = $ret = [];
+
+		$language = $this->params['language'];
+		foreach ( $documents as $document ) {
+			$data = $document->getData();
+
+			if ( !$this->server->isLocalSuggestion( $data ) ) {
+				continue;
+			}
+
+			$title = Title::newFromText( $data['localid'] );
+			if ( !$title ) {
+				continue;
+			}
+
+			$handle = new MessageHandle( $title );
+			if ( !$handle->isValid() ) {
+				continue;
+			}
+
+			$key = $title->getNamespace() . ':' . $title->getDBkey();
+			$messages[$key] = $data['content'];
+		}
+
+		$definitions = new MessageDefinitions( $messages );
+		$collection = MessageCollection::newFromDefinitions( $definitions, $language );
+
+		$filter = $this->params['filter'];
+		if ( $filter === 'untranslated' ) {
+			$collection->filter( 'hastranslation', true );
+		} elseif ( in_array( $filter, $this->getAvailableFilters(), true ) ) {
+			// Strict comparison so that non-string values (e.g. 0 or true)
+			// are never treated as a known filter name.
+			$collection->filter( $filter, false );
+		}
+
+		$showTranslation = $filter === 'translated' || $filter === 'fuzzy';
+		if ( $showTranslation ) {
+			$collection->loadTranslations();
+		}
+
+		foreach ( $collection->keys() as $mkey => $title ) {
+			$result = [];
+			$result['content'] = $messages[$mkey];
+			if ( $showTranslation ) {
+				// For these filters the translation replaces the source text.
+				$result['content'] = $collection[$mkey]->translation();
+			}
+			$handle = new MessageHandle( $title );
+			$result['localid'] = $handle->getTitleForBase()->getPrefixedText();
+			$result['language'] = $language;
+
+			$ret[] = $result;
+		}
+
+		return $ret;
+	}
+
+	/**
+	 * @return string[] Filter names accepted in the 'filter' parameter
+	 */
+	public function getAvailableFilters() {
+		return [
+			'translated',
+			'fuzzy',
+			'untranslated'
+		];
+	}
+
+	/**
+	 * @return int Total hits reported by the first scroll response; 0 before
+	 *  getDocuments() has been called.
+	 */
+	public function getTotalHits() {
+		return $this->total;
+	}
+
+	/**
+	 * @return ResultSet|null First scroll response of the last getDocuments() call
+	 */
+	public function getResultSet() {
+		return $this->resultset;
+	}
+}
diff --git a/www/wiki/extensions/Translate/ttmserver/DatabaseTTMServer.php b/www/wiki/extensions/Translate/ttmserver/DatabaseTTMServer.php
new file mode 100644
index 00000000..70c99b64
--- /dev/null
+++ b/www/wiki/extensions/Translate/ttmserver/DatabaseTTMServer.php
@@ -0,0 +1,282 @@
+<?php
+/**
+ * TTMServer - The Translate extension translation memory interface
+ *
+ * @file
+ * @author Niklas Laxström
+ * @copyright Copyright © 2012-2013, Niklas Laxström
+ * @license GPL-2.0-or-later
+ * @ingroup TTMServer
+ */
+
+use Wikimedia\Rdbms\DBQueryError;
+
+/**
+ * Mysql based backend.
+ *
+ * Uses the tables translate_tms (sources), translate_tmt (translations) and
+ * translate_tmf (fulltext tokens of sources).
+ * @ingroup TTMServer
+ * @since 2012-06-27
+ */
+class DatabaseTTMServer extends TTMServer implements WritableTTMServer, ReadableTTMServer {
+	/** @var array Map of batch key => tms_sid, filled by batchInsertDefinitions() */
+	protected $sids;
+
+	/**
+	 * @param int $mode DB_REPLICA|DB_MASTER
+	 * @return \Wikimedia\Rdbms\IDatabase
+	 */
+	protected function getDB( $mode = DB_REPLICA ) {
+		return wfGetDB( $mode, 'ttmserver', $this->config['database'] );
+	}
+
+	/**
+	 * Replace the stored translation of a message in one language.
+	 *
+	 * @param MessageHandle $handle
+	 * @param ?string $targetText New translation; null only removes the old one.
+	 * @return bool False if the handle is unusable, is for the source language
+	 *  or has no definition; true otherwise.
+	 */
+	public function update( MessageHandle $handle, $targetText ) {
+		if ( !$handle->isValid() || $handle->getCode() === '' ) {
+			return false;
+		}
+
+		$mkey = $handle->getKey();
+		$group = $handle->getGroup();
+		$targetLanguage = $handle->getCode();
+		$sourceLanguage = $group->getSourceLanguage();
+
+		// Skip definitions to not slow down mass imports etc.
+		// These will be added when the first translation is made
+		if ( $targetLanguage === $sourceLanguage ) {
+			return false;
+		}
+
+		$definition = $group->getMessage( $mkey, $sourceLanguage );
+		if ( !is_string( $definition ) || !strlen( trim( $definition ) ) ) {
+			return false;
+		}
+
+		$context = Title::makeTitle( $handle->getTitle()->getNamespace(), $mkey );
+		$dbw = $this->getDB( DB_MASTER );
+		/* Check that the definition exists and fetch the sid. If not, add
+		 * the definition and retrieve the sid. If the definition changes,
+		 * we will create a new entry - otherwise we could at some point
+		 * get suggestions which do not match the original definition any
+		 * longer. The old translations are still kept until purged by
+		 * rerunning the bootstrap script. */
+		$conds = [
+			'tms_context' => $context->getPrefixedText(),
+			'tms_text' => $definition,
+		];
+
+		$sid = $dbw->selectField( 'translate_tms', 'tms_sid', $conds, __METHOD__ );
+		if ( $sid === false ) {
+			$sid = $this->insertSource( $context, $sourceLanguage, $definition );
+		}
+
+		// Delete old translations for this message if any. Could also use replace
+		$deleteConds = [
+			'tmt_sid' => $sid,
+			'tmt_lang' => $targetLanguage,
+		];
+		$dbw->delete( 'translate_tmt', $deleteConds, __METHOD__ );
+
+		// Insert the new translation
+		if ( $targetText !== null ) {
+			$row = $deleteConds + [
+				'tmt_text' => $targetText,
+			];
+
+			$dbw->insert( 'translate_tmt', $row, __METHOD__ );
+		}
+
+		return true;
+	}
+
+	/**
+	 * Insert a source text and, when it yields any, its fulltext tokens.
+	 *
+	 * @param Title $context
+	 * @param string $sourceLanguage Language code
+	 * @param string $text
+	 * @return int Insert id of the new translate_tms row
+	 */
+	protected function insertSource( Title $context, $sourceLanguage, $text ) {
+		$row = [
+			'tms_lang' => $sourceLanguage,
+			'tms_len' => mb_strlen( $text ),
+			'tms_text' => $text,
+			'tms_context' => $context->getPrefixedText(),
+		];
+
+		$dbw = $this->getDB( DB_MASTER );
+		$dbw->insert( 'translate_tms', $row, __METHOD__ );
+		$sid = $dbw->insertId();
+
+		$fulltext = $this->filterForFulltext( $sourceLanguage, $text );
+		if ( count( $fulltext ) ) {
+			$row = [
+				'tmf_sid' => $sid,
+				'tmf_text' => implode( ' ', $fulltext ),
+			];
+			$dbw->insert( 'translate_tmf', $row, __METHOD__ );
+		}
+
+		return $sid;
+	}
+
+	/**
+	 * Tokenizes the text for fulltext search.
+	 * Tries to find the most useful tokens.
+	 *
+	 * @param string $language Language code
+	 * @param string $input
+	 * @return array At most 10 unique tokens; empty if the input has fewer
+	 *  than 4 segments.
+	 */
+	protected function filterForFulltext( $language, $input ) {
+		$lang = Language::factory( $language );
+
+		$text = preg_replace( '/[^[:alnum:]]/u', ' ', $input );
+		$text = $lang->segmentByWord( $text );
+		$text = $lang->lc( $text );
+		$segments = preg_split( '/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY );
+		if ( count( $segments ) < 4 ) {
+			return [];
+		}
+
+		foreach ( $segments as $i => $segment ) {
+			// Yes strlen
+			$len = strlen( $segment );
+			if ( $len < 4 || $len > 15 ) {
+				unset( $segments[$i] );
+			}
+		}
+
+		$segments = array_unique( $segments );
+		$segments = array_slice( $segments, 0, 10 );
+
+		return $segments;
+	}
+
+	/**
+	 * Empty all three tables and drop the fulltext index, which endBootstrap()
+	 * creates again.
+	 */
+	public function beginBootstrap() {
+		$dbw = $this->getDB( DB_MASTER );
+		$dbw->delete( 'translate_tms', '*', __METHOD__ );
+		$dbw->delete( 'translate_tmt', '*', __METHOD__ );
+		$dbw->delete( 'translate_tmf', '*', __METHOD__ );
+		$table = $dbw->tableName( 'translate_tmf' );
+		try {
+			$dbw->query( "DROP INDEX tmf_text ON $table" );
+		} catch ( DBQueryError $e ) {
+			// Perhaps the script was aborted before it got
+			// chance to add the index back.
+		}
+	}
+
+	/**
+	 * Reset the batch key => sid map for a new batch.
+	 */
+	public function beginBatch() {
+		$this->sids = [];
+	}
+
+	/**
+	 * @param array $batch Items of the form [ title, language, text ], keyed by
+	 *  the same keys later used in batchInsertTranslations().
+	 */
+	public function batchInsertDefinitions( array $batch ) {
+		foreach ( $batch as $key => $item ) {
+			list( $title, $language, $text ) = $item;
+			$handle = new MessageHandle( $title );
+			$context = Title::makeTitle( $handle->getTitle()->getNamespace(), $handle->getKey() );
+			$this->sids[$key] = $this->insertSource( $context, $language, $text );
+		}
+		wfWaitForSlaves( 10 );
+	}
+
+	/**
+	 * @param array $batch Items of the form [ (ignored), language, text ]; each
+	 *  key must have been seen by batchInsertDefinitions() in this batch.
+	 */
+	public function batchInsertTranslations( array $batch ) {
+		$rows = [];
+		foreach ( $batch as $key => $data ) {
+			list( , $language, $text ) = $data;
+			$rows[] = [
+				'tmt_sid' => $this->sids[$key],
+				'tmt_lang' => $language,
+				'tmt_text' => $text,
+			];
+		}
+
+		$dbw = $this->getDB( DB_MASTER );
+		$dbw->insert( 'translate_tmt', $rows, __METHOD__ );
+		wfWaitForSlaves( 10 );
+	}
+
+	public function endBatch() {
+	}
+
+	/**
+	 * Recreate the fulltext index dropped by beginBootstrap().
+	 */
+	public function endBootstrap() {
+		$dbw = $this->getDB( DB_MASTER );
+		$table = $dbw->tableName( 'translate_tmf' );
+		$dbw->query( "CREATE FULLTEXT INDEX tmf_text ON $table (tmf_text)" );
+	}
+
+	/* Reading interface */
+
+	/**
+	 * @param array $suggestion
+	 * @return bool Always true for this backend
+	 */
+	public function isLocalSuggestion( array $suggestion ) {
+		return true;
+	}
+
+	/**
+	 * @param array $suggestion Must have the key 'location' (a page title)
+	 * @return string Canonical URL of that page
+	 */
+	public function expandLocation( array $suggestion ) {
+		$title = Title::newFromText( $suggestion['location'] );
+
+		return $title->getCanonicalURL();
+	}
+
+	/**
+	 * Find translation suggestions for a text.
+	 *
+	 * @param string $sourceLanguage Language code
+	 * @param string $targetLanguage Language code
+	 * @param string $text
+	 * @return array[] Suggestions as built by processQueryResults()
+	 */
+	public function query( $sourceLanguage, $targetLanguage, $text ) {
+		// Calculate the bounds of the string length which are able
+		// to satisfy the cutoff percentage in edit distance.
+		$len = mb_strlen( $text );
+		$min = ceil( max( $len * $this->config['cutoff'], 2 ) );
+		$max = floor( $len / $this->config['cutoff'] );
+
+		// We could use fulltext index to narrow the results further
+		$dbr = $this->getDB( DB_REPLICA );
+		$tables = [ 'translate_tmt', 'translate_tms' ];
+		$fields = [ 'tms_context', 'tms_text', 'tmt_lang', 'tmt_text' ];
+
+		$conds = [
+			'tms_lang' => $sourceLanguage,
+			'tmt_lang' => $targetLanguage,
+			"tms_len BETWEEN $min AND $max",
+			'tms_sid = tmt_sid',
+		];
+
+		$fulltext = $this->filterForFulltext( $sourceLanguage, $text );
+		if ( $fulltext ) {
+			$tables[] = 'translate_tmf';
+			$list = implode( ' ', $fulltext );
+			$conds[] = 'tmf_sid = tmt_sid';
+			// Let the database layer quote the value instead of relying on
+			// the tokenizer having stripped every special character.
+			$conds[] = 'MATCH(tmf_text) AGAINST(' . $dbr->addQuotes( $list ) . ')';
+		}
+
+		$res = $dbr->select( $tables, $fields, $conds, __METHOD__ );
+
+		return $this->processQueryResults( $res, $text, $targetLanguage );
+	}
+
+	/**
+	 * Score the candidate rows by edit distance and keep those at or above
+	 * the configured cutoff. Gives up after about five seconds.
+	 *
+	 * @param iterable $res Rows with tms_text, tmt_text and tms_context
+	 * @param string $text Text the suggestions are for
+	 * @param string $targetLanguage Language code
+	 * @return array[] Result of TTMServer::sortSuggestions()
+	 */
+	protected function processQueryResults( $res, $text, $targetLanguage ) {
+		$timeLimit = microtime( true ) + 5;
+
+		$lenA = mb_strlen( $text );
+		$results = [];
+		foreach ( $res as $row ) {
+			if ( microtime( true ) > $timeLimit ) {
+				// Having no suggestions is better than preventing translation
+				// altogether by timing out the request :(
+				break;
+			}
+
+			$a = $text;
+			$b = $row->tms_text;
+			$lenB = mb_strlen( $b );
+			$len = min( $lenA, $lenB );
+			if ( $len > 600 ) {
+				// two strings of length 1500 ~ 10s
+				// two strings of length 2250 ~ 30s
+				// Too slow to compare: assume the worst possible distance.
+				$dist = $len;
+			} else {
+				$dist = self::levenshtein( $a, $b, $lenA, $lenB );
+			}
+			$quality = 1 - ( $dist * 0.9 / $len );
+
+			if ( $quality >= $this->config['cutoff'] ) {
+				$results[] = [
+					'source' => $row->tms_text,
+					'target' => $row->tmt_text,
+					'context' => $row->tms_context,
+					'location' => $row->tms_context . '/' . $targetLanguage,
+					'quality' => $quality,
+					'wiki' => $row->tms_wiki ?? wfWikiID(),
+				];
+			}
+		}
+		$results = TTMServer::sortSuggestions( $results );
+
+		return $results;
+	}
+}
diff --git a/www/wiki/extensions/Translate/ttmserver/ElasticSearchTTMServer.php b/www/wiki/extensions/Translate/ttmserver/ElasticSearchTTMServer.php
new file mode 100644
index 00000000..0835d518
--- /dev/null
+++ b/www/wiki/extensions/Translate/ttmserver/ElasticSearchTTMServer.php
@@ -0,0 +1,890 @@
+<?php
+/**
+ * TTMServer - The Translate extension translation memory interface
+ *
+ * @file
+ * @author Niklas Laxström
+ * @license GPL-2.0-or-later
+ * @ingroup TTMServer
+ */
+
+use MediaWiki\Logger\LoggerFactory;
+
+/**
+ * TTMServer backed based on ElasticSearch. Depends on Elastica.
+ * @since 2014.04
+ * @ingroup TTMServer
+ */
+class ElasticSearchTTMServer
+ extends TTMServer
+ implements ReadableTTMServer, WritableTTMServer, SearchableTTMserver
+{
+ /**
+ * @const int number of documents that will be loaded and deleted in a
+ * single operation
+ */
+ const BULK_DELETE_CHUNK_SIZE = 100;
+
+ /**
+ * @const int in case a write operation fails during a batch process
+ * this constant controls the number of times we will retry the same
+ * operation.
+ */
+ const BULK_INDEX_RETRY_ATTEMPTS = 5;
+
+ /**
+ * @const int time (seconds) to wait for the index to ready before
+ * starting to index. Since we wait for index status it can be relatively
+ * long especially if some nodes are restarted.
+ */
+ const WAIT_UNTIL_READY_TIMEOUT = 3600;
+
+ /**
+ * Flag in the frozen index that indicates that all indices
+ * are frozen (useful only when this service shares the cluster with
+ * CirrusSearch)
+ */
+ const ALL_INDEXES_FROZEN_NAME = 'freeze_everything';
+
+ /**
+ * Type used in the frozen index
+ */
+ const FROZEN_TYPE = 'frozen';
+
+ /**
+ * @var \Elastica\Client
+ */
+ protected $client;
+
+ /**
+ * Reference to the maintenance script to relay logging output.
+ */
+ protected $logger;
+
+ /**
+ * Used for Reindex
+ */
+ protected $updateMapping = false;
+
+	/**
+	 * A suggestion is local when its 'wiki' field is the id of the current wiki.
+	 *
+	 * @param array $suggestion
+	 * @return bool
+	 */
+	public function isLocalSuggestion( array $suggestion ) {
+		$suggestionWiki = $suggestion['wiki'];
+
+		return $suggestionWiki === wfWikiID();
+	}
+
+	/**
+	 * The stored documents already carry a full URI, so it is returned as is.
+	 *
+	 * @param array $suggestion Must have the key 'uri'
+	 * @return string
+	 */
+	public function expandLocation( array $suggestion ) {
+		$uri = $suggestion['uri'];
+
+		return $uri;
+	}
+
+	/**
+	 * Fetch suggestions, converting any failure into a TranslationHelperException.
+	 *
+	 * @param string $sourceLanguage Language code
+	 * @param string $targetLanguage Language code
+	 * @param string $text
+	 * @return array[]
+	 * @throws TranslationHelperException
+	 */
+	public function query( $sourceLanguage, $targetLanguage, $text ) {
+		try {
+			$suggestions = $this->doQuery( $sourceLanguage, $targetLanguage, $text );
+		} catch ( Exception $e ) {
+			throw new TranslationHelperException( 'Elastica exception: ' . $e );
+		}
+
+		return $suggestions;
+	}
+
+	/**
+	 * Find translation suggestions for a text.
+	 *
+	 * Runs two kinds of queries: first the best matching source texts in the
+	 * source language, then the translations of those in the target language.
+	 * The connection timeout is lowered to 10 seconds for the duration of the
+	 * call and always restored afterwards, including when a search throws.
+	 *
+	 * @param string $sourceLanguage Language code
+	 * @param string $targetLanguage Language code
+	 * @param string $text
+	 * @return array[] Suggestions with the keys 'source', 'target', 'context',
+	 *  'quality', 'wiki', 'location' and 'uri', best quality first.
+	 * @throws \RuntimeException If the wikimedia extra plugin is not enabled
+	 */
+	protected function doQuery( $sourceLanguage, $targetLanguage, $text ) {
+		if ( !$this->useWikimediaExtraPlugin() ) {
+			// ElasticTTM is currently not compatible with elasticsearch 2.x/5.x
+			// It needs FuzzyLikeThis ported via the wmf extra plugin
+			throw new \RuntimeException( 'The wikimedia extra plugin is mandatory.' );
+		}
+		/* Two query system:
+		 * 1) Find all strings in source language that match text
+		 * 2) Do another query for translations for those strings
+		 */
+		$connection = $this->getClient()->getConnection();
+		$oldTimeout = $connection->getTimeout();
+		$connection->setTimeout( 10 );
+
+		try {
+			$fuzzyQuery = new FuzzyLikeThis();
+			$fuzzyQuery->setLikeText( $text );
+			$fuzzyQuery->addFields( [ 'content' ] );
+
+			$boostQuery = new \Elastica\Query\FunctionScore();
+			$boostQuery->addFunction(
+				'levenshtein_distance_score',
+				[
+					'text' => $text,
+					'field' => 'content'
+				]
+			);
+			$boostQuery->setBoostMode( \Elastica\Query\FunctionScore::BOOST_MODE_REPLACE );
+
+			// Wrap the fuzzy query so it can be used as a filter.
+			// This is slightly faster, as ES can throw away the scores by this query.
+			$bool = new \Elastica\Query\BoolQuery();
+			$bool->addFilter( $fuzzyQuery );
+			$bool->addMust( $boostQuery );
+
+			$languageFilter = new \Elastica\Query\Term();
+			$languageFilter->setTerm( 'language', $sourceLanguage );
+			$bool->addFilter( $languageFilter );
+
+			// The whole query
+			$query = new \Elastica\Query();
+			$query->setQuery( $bool );
+
+			// The interface usually displays three best candidates. These might
+			// come from more than three source things, if the translations are
+			// the same. In other words suggestions are grouped by the suggested
+			// translation. This algorithm might not find all suggestions, if the
+			// top N best matching source texts don't have equivalent translations
+			// in the target language, but worse matches which we did not fetch do.
+			// This code tries to balance between doing too many or too big queries
+			// and not fetching enough results to show all possible suggestions.
+			$sizeFirst = 100;
+			$sizeSecond = $sizeFirst * 5;
+
+			$query->setFrom( 0 );
+			$query->setSize( $sizeFirst );
+			$query->setParam( '_source', [ 'content' ] );
+			$cutoff = $this->config['cutoff'] ?? 0.65;
+			$query->setParam( 'min_score', $cutoff );
+			$query->setSort( [ '_score', '_uid' ] );
+
+			/* This query is doing two unrelated things:
+			 * 1) Collect the message contents and scores so that they can
+			 *    be accessed later for the translations we found.
+			 * 2) Build the query string for the query that fetches the translations.
+			 */
+			$contents = $scores = $terms = [];
+			do {
+				$resultset = $this->getType()->search( $query );
+
+				if ( count( $resultset ) === 0 ) {
+					break;
+				}
+
+				foreach ( $resultset->getResults() as $result ) {
+					$data = $result->getData();
+					$score = $result->getScore();
+
+					// Document ids end in "/<language>"; strip that to get the source id.
+					$sourceId = preg_replace( '~/[^/]+$~', '', $result->getId() );
+					$contents[$sourceId] = $data['content'];
+					$scores[$sourceId] = $score;
+					$terms[] = "$sourceId/$targetLanguage";
+				}
+
+				// Check if it looks like that we are hitting the long tail already.
+				// Otherwise, we'll do a query to fetch some more to reach a "sane"
+				// breaking point, i.e. include all suggestions with same content
+				// for reliable used X times statistics.
+				if ( count( array_unique( $scores ) ) > 5 ) {
+					break;
+				}
+
+				// Okay, We are now in second iteration of the loop. We already got
+				// lots of suggestions. We will give up for now even if it means we
+				// return in some sense incomplete results.
+				if ( count( $resultset ) === $sizeSecond ) {
+					break;
+				}
+
+				// After the first query, the smallest score is the new threshold.
+				$query->setParam( 'min_score', $score );
+				$query->setFrom( $query->getParam( 'size' ) + $query->getParam( 'from' ) );
+				$query->setSize( $sizeSecond );
+
+				// Break if we already got all hits
+			} while ( $resultset->getTotalHits() > count( $contents ) );
+
+			$suggestions = [];
+
+			// Skip second query if first query found nothing.
+			if ( $terms !== [] ) {
+				$idQuery = new \Elastica\Query\Terms();
+				$idQuery->setTerms( '_id', $terms );
+
+				$query = new \Elastica\Query( $idQuery );
+				$query->setSize( 25 );
+				$query->setParam( '_source', [ 'wiki', 'uri', 'content', 'localid' ] );
+				$resultset = $this->getType()->search( $query );
+
+				foreach ( $resultset->getResults() as $result ) {
+					$data = $result->getData();
+
+					// Construct the matching source id
+					$sourceId = preg_replace( '~/[^/]+$~', '', $result->getId() );
+
+					$suggestions[] = [
+						'source' => $contents[$sourceId],
+						'target' => $data['content'],
+						'context' => $data['localid'],
+						'quality' => $scores[$sourceId],
+						'wiki' => $data['wiki'],
+						'location' => $data['localid'] . '/' . $targetLanguage,
+						'uri' => $data['uri'],
+					];
+				}
+
+				// Ensure results are in quality order
+				uasort( $suggestions, function ( $a, $b ) {
+					if ( $a['quality'] === $b['quality'] ) {
+						return 0;
+					}
+
+					return ( $a['quality'] < $b['quality'] ) ? 1 : -1;
+				} );
+			}
+
+			return $suggestions;
+		} finally {
+			// Runs on return and on exception alike, so a failed search cannot
+			// leave the shared connection stuck with the shortened timeout.
+			$connection->setTimeout( $oldTimeout );
+		}
+	}
+
+ /* Write functions */
+
+	/**
+	 * Add / update translations.
+	 *
+	 * @param MessageHandle $handle
+	 * @param ?string $targetText Null means nothing is indexed after the
+	 *  old translation has been removed.
+	 * @throws \RuntimeException
+	 * @return bool False for unusable handles, true otherwise
+	 */
+	public function update( MessageHandle $handle, $targetText ) {
+		if ( !$handle->isValid() || $handle->getCode() === '' ) {
+			return false;
+		}
+
+		/* There are various different cases here:
+		 * [new or updated] [fuzzy|non-fuzzy] [translation|definition]
+		 * 1) We don't distinguish between new or updated here.
+		 * 2) Delete old translation, but not definition
+		 * 3) Insert new translation or definition, if non-fuzzy
+		 *    The definition should never be fuzzied anyway.
+		 *
+		 * These only apply to known messages.
+		 */
+
+		$sourceLanguage = $handle->getGroup()->getSourceLanguage();
+
+		// Do not delete definitions, because the translations are attached to that
+		if ( $handle->getCode() !== $sourceLanguage ) {
+			$localid = $handle->getTitleForBase()->getPrefixedText();
+
+			// Match exactly this wiki + language + message.
+			$oldTranslation = new \Elastica\Query\BoolQuery();
+			$oldTranslation->addFilter( new Elastica\Query\Term( [ 'wiki' => wfWikiID() ] ) );
+			$oldTranslation->addFilter( new Elastica\Query\Term( [ 'language' => $handle->getCode() ] ) );
+			$oldTranslation->addFilter( new Elastica\Query\Term( [ 'localid' => $localid ] ) );
+
+			$this->deleteByQuery( $this->getType(), Elastica\Query::create( $oldTranslation ) );
+		}
+
+		// If translation was made fuzzy, we do not need to add anything
+		if ( $targetText === null ) {
+			return true;
+		}
+
+		$revId = $handle->getTitleForLanguage( $sourceLanguage )->getLatestRevID();
+		$doc = $this->createDocument( $handle, $targetText, $revId );
+		// Captured for the closure below, where __METHOD__ would name the closure.
+		$fname = __METHOD__;
+
+		$indexDocument = function () use ( $doc ) {
+			$this->getType()->addDocument( $doc );
+		};
+		$onFailure = function ( $e, $errors ) use ( $fname ) {
+			$c = get_class( $e );
+			$msg = $e->getMessage();
+			error_log( $fname . ": update failed ($c: $msg); retrying." );
+			sleep( 10 );
+		};
+		MWElasticUtils::withRetry( self::BULK_INDEX_RETRY_ATTEMPTS, $indexDocument, $onFailure );
+
+		return true;
+	}
+
+	/**
+	 * Build the document stored for one message text.
+	 *
+	 * The document id has the form "<wiki>-<localid>-<revId>/<language>".
+	 *
+	 * @param MessageHandle $handle
+	 * @param string $text
+	 * @param int $revId
+	 * @return \Elastica\Document
+	 */
+	protected function createDocument( MessageHandle $handle, $text, $revId ) {
+		$code = $handle->getCode();
+		$baseTitleText = $handle->getTitleForBase()->getPrefixedText();
+		$wikiId = wfWikiID();
+
+		$documentId = $wikiId . '-' . $baseTitleText . '-' . $revId . '/' . $code;
+
+		$fields = [];
+		$fields['wiki'] = $wikiId;
+		$fields['uri'] = $handle->getTitle()->getCanonicalURL();
+		$fields['localid'] = $baseTitleText;
+		$fields['language'] = $code;
+		$fields['content'] = $text;
+		$fields['group'] = $handle->getGroupIds();
+
+		return new \Elastica\Document( $documentId, $fields );
+	}
+
+	/**
+	 * Create the index with the analyzers used by the content sub-fields.
+	 *
+	 * @param bool $rebuild Deletes index first if already exists
+	 */
+	public function createIndex( $rebuild ) {
+		$prefixFilter = [
+			'type' => 'edge_ngram',
+			'min_gram' => 2,
+			'max_gram' => 20
+		];
+		$prefixAnalyzer = [
+			'type' => 'custom',
+			'tokenizer' => 'standard',
+			'filter' => [ 'standard', 'lowercase', 'prefix_filter' ]
+		];
+		$caseSensitiveAnalyzer = [
+			'tokenizer' => 'standard',
+			'filter' => [ 'standard' ]
+		];
+
+		$settings = [
+			'number_of_shards' => $this->getShardCount(),
+			'analysis' => [
+				'filter' => [ 'prefix_filter' => $prefixFilter ],
+				'analyzer' => [
+					'prefix' => $prefixAnalyzer,
+					'casesensitive' => $caseSensitiveAnalyzer
+				]
+			]
+		];
+
+		// A value containing a dash (like '0-2') is a range and goes to
+		// auto_expand_replicas; anything else is a fixed replica count.
+		$replicas = $this->getReplicaCount();
+		$isRange = strpos( $replicas, '-' ) !== false;
+		if ( $isRange ) {
+			$settings['auto_expand_replicas'] = $replicas;
+		} else {
+			$settings['number_of_replicas'] = $replicas;
+		}
+
+		$this->getType()->getIndex()->create( $settings, $rebuild );
+	}
+
+ /**
+ * Begin the bootstrap process.
+ *
+ * Creates (or, after doMappingUpdate(), recreates) the index, disables
+ * index refresh, deletes this wiki's documents, sends the field mapping
+ * and then waits until the index is ready.
+ *
+ * @throws \RuntimeException
+ */
+ public function beginBootstrap() {
+ $type = $this->getType();
+ if ( $this->updateMapping ) {
+ $this->logOutput( 'Updating the index mappings...' );
+ $this->createIndex( true );
+ } elseif ( !$type->getIndex()->exists() ) {
+ $this->createIndex( false );
+ }
+
+ // Refresh is turned off here; endBootstrap() sets it back to '5s'.
+ $settings = $type->getIndex()->getSettings();
+ $settings->setRefreshInterval( '-1' );
+
+ // Only documents of the current wiki are removed.
+ $this->deleteByQuery( $this->getType(), \Elastica\Query::create(
+ ( new Elastica\Query\Term() )->setTerm( 'wiki', wfWikiID() ) ) );
+
+ $mapping = new \Elastica\Type\Mapping();
+ $mapping->setType( $type );
+ $mapping->setProperties( [
+ 'wiki' => [ 'type' => 'keyword' ],
+ 'localid' => [ 'type' => 'keyword' ],
+ 'uri' => [ 'type' => 'keyword' ],
+ 'language' => [ 'type' => 'keyword' ],
+ 'group' => [ 'type' => 'keyword' ],
+ 'content' => [
+ 'type' => 'text',
+ // The 'prefix' and 'casesensitive' analyzers are defined in createIndex().
+ 'fields' => [
+ 'content' => [
+ 'type' => 'text',
+ 'term_vector' => 'yes'
+ ],
+ 'prefix_complete' => [
+ 'type' => 'text',
+ 'analyzer' => 'prefix',
+ 'search_analyzer' => 'standard',
+ 'term_vector' => 'yes'
+ ],
+ 'case_sensitive' => [
+ 'type' => 'text',
+ 'analyzer' => 'casesensitive',
+ 'term_vector' => 'yes'
+ ]
+ ]
+ ],
+ ] );
+ $mapping->send();
+
+ $this->waitUntilReady();
+ }
+
+ /**
+ * Intentionally a no-op for this backend.
+ */
+ public function beginBatch() {
+ // I hate the rule that forbids {}
+ }
+
+	/**
+	 * Index a batch of definitions.
+	 *
+	 * Runs a LinkBatch over the titles of all handles first (presumably to
+	 * preload page data in a single query - confirm against LinkBatch), then
+	 * indexes the batch the same way as translations.
+	 *
+	 * @param array $batch Items whose first element is a MessageHandle
+	 */
+	public function batchInsertDefinitions( array $batch ) {
+		$linkBatch = new LinkBatch();
+		foreach ( $batch as $item ) {
+			$handle = $item[0];
+			$linkBatch->addObj( $handle->getTitle() );
+		}
+		$linkBatch->execute();
+
+		$this->batchInsertTranslations( $batch );
+	}
+
+	/**
+	 * Index a batch of texts in one bulk request, retrying on failure.
+	 *
+	 * @param array $batch Items of the form [ MessageHandle, source language, text ]
+	 */
+	public function batchInsertTranslations( array $batch ) {
+		$documents = [];
+		foreach ( $batch as $item ) {
+			list( $handle, $sourceLanguage, $text ) = $item;
+			$sourceTitle = $handle->getTitleForLanguage( $sourceLanguage );
+			$documents[] = $this->createDocument( $handle, $text, $sourceTitle->getLatestRevID() );
+		}
+
+		$sendBatch = function () use ( $documents ) {
+			$this->getType()->addDocuments( $documents );
+		};
+		$onFailure = function ( $e, $errors ) {
+			$c = get_class( $e );
+			$msg = $e->getMessage();
+			$this->logOutput( "Batch failed ($c: $msg), trying again in 10 seconds" );
+			sleep( 10 );
+		};
+
+		MWElasticUtils::withRetry( self::BULK_INDEX_RETRY_ATTEMPTS, $sendBatch, $onFailure );
+	}
+
+ /**
+ * Intentionally a no-op for this backend.
+ */
+ public function endBatch() {
+ // I hate the rule that forbids {}
+ }
+
+	/**
+	 * Finish bootstrapping: refresh and force-merge the index, then set the
+	 * refresh interval to '5s'.
+	 */
+	public function endBootstrap() {
+		$ttmIndex = $this->getType()->getIndex();
+
+		$ttmIndex->refresh();
+		$ttmIndex->forcemerge();
+
+		$indexSettings = $ttmIndex->getSettings();
+		$indexSettings->setRefreshInterval( '5s' );
+	}
+
+	/**
+	 * Lazily create the Elastica client, using $this->config['config'] as the
+	 * client configuration when it is set.
+	 *
+	 * @return \Elastica\Client
+	 */
+	public function getClient() {
+		if ( $this->client ) {
+			return $this->client;
+		}
+
+		$this->client = isset( $this->config['config'] )
+			? new \Elastica\Client( $this->config['config'] )
+			: new \Elastica\Client();
+
+		return $this->client;
+	}
+
+	/**
+	 * @return bool true if the backend is configured with the wikimedia extra plugin
+	 */
+	public function useWikimediaExtraPlugin() {
+		// empty() is false only for a value that is both set and truthy.
+		return !empty( $this->config['use_wikimedia_extra'] );
+	}
+
+	/**
+	 * @return string Configured index name, 'ttmserver' when none is set
+	 */
+	private function getIndexName() {
+		return $this->config['index'] ?? 'ttmserver';
+	}
+
+	/**
+	 * Get the 'message' type of the configured index.
+	 *
+	 * @return \Elastica\Type
+	 */
+	public function getType() {
+		$client = $this->getClient();
+		$index = $client->getIndex( $this->getIndexName() );
+
+		return $index->getType( 'message' );
+	}
+
+	/**
+	 * @return int|mixed Configured 'shards' value, 1 when none is set
+	 */
+	protected function getShardCount() {
+		return isset( $this->config['shards'] ) ? $this->config['shards'] : 1;
+	}
+
+	/**
+	 * @return string|mixed Configured 'replicas' value, '0-2' when none is set
+	 */
+	protected function getReplicaCount() {
+		return isset( $this->config['replicas'] ) ? $this->config['replicas'] : '0-2';
+	}
+
+	/**
+	 * Get index health
+	 * TODO: Remove this code in the future as we drop support for
+	 * older versions of the Elastica extension.
+	 *
+	 * @param string $indexName
+	 * @return array the index health status
+	 * @throws \Exception If the response reports an error
+	 */
+	protected function getIndexHealth( $indexName ) {
+		$healthResponse = $this->getClient()->request( "_cluster/health/$indexName" );
+
+		if ( !$healthResponse->hasError() ) {
+			return $healthResponse->getData();
+		}
+
+		throw new \Exception( "Error while fetching index health status: " . $healthResponse->getError() );
+	}
+
+	/**
+	 * Wait for the index to go green
+	 *
+	 * Polls the index health every five seconds until it is green or the
+	 * timeout has passed. Failed health requests are logged and retried
+	 * after the same delay.
+	 *
+	 * NOTE: This method has been copied and adjusted from
+	 * CirrusSearch/includes/Maintenance/ConfigUtils.php. Ideally we'd
+	 * like to make these utility methods available in the Elastica
+	 * extension, but this one requires some refactoring in cirrus first.
+	 * TODO: Remove this code in the future as we drop support for
+	 * older versions of the Elastica extension.
+	 *
+	 * @param string $indexName
+	 * @param int $timeout Seconds
+	 * @return bool true if the index is green false otherwise.
+	 */
+	protected function waitForGreen( $indexName, $timeout ) {
+		$startTime = time();
+		while ( ( $startTime + $timeout ) > time() ) {
+			try {
+				$response = $this->getIndexHealth( $indexName );
+				$status = $response['status'] ?? 'unknown';
+				if ( $status === 'green' ) {
+					$this->logOutput( "\tGreen!" );
+					return true;
+				}
+				$this->logOutput( "\tIndex is $status retrying..." );
+			} catch ( \Exception $e ) {
+				$this->logOutput( "Error while waiting for green ({$e->getMessage()}), retrying..." );
+			}
+
+			// Pause on the error path too; otherwise a failing cluster would be
+			// hit in a tight loop until the timeout expires.
+			sleep( 5 );
+		}
+		return false;
+	}
+
+	/**
+	 * Block until the index is green, terminating the script on timeout.
+	 *
+	 * Uses MWElasticUtils::waitForGreen() when it exists and the local
+	 * waitForGreen() otherwise.
+	 */
+	protected function waitUntilReady() {
+		if ( method_exists( 'MWElasticUtils', 'waitForGreen' ) ) {
+			$statuses = MWElasticUtils::waitForGreen(
+				$this->getClient(),
+				$this->getIndexName(),
+				self::WAIT_UNTIL_READY_TIMEOUT );
+			$this->logOutput( "Waiting for the index to go green..." );
+			foreach ( $statuses as $message ) {
+				$this->logOutput( $message );
+			}
+
+			if ( !$statuses->getReturn() ) {
+				die( "Timeout! Please check server logs for {$this->getIndexName()}." );
+			}
+
+			return;
+		}
+
+		// TODO: This code can be removed in the future as we drop support for
+		// older versions of the Elastica extension.
+		$indexName = $this->getType()->getIndex()->getName();
+		$this->logOutput( "Waiting for the index to go green..." );
+		if ( !$this->waitForGreen( $indexName, self::WAIT_UNTIL_READY_TIMEOUT ) ) {
+			// Use the name fetched above rather than calling a getIndex()
+			// method on $this, which this class does not visibly define.
+			die( "Timeout! Please check server logs for {$indexName}." );
+		}
+	}
+
+ /**
+ * Set the object used by logOutput(); it must provide statusLine().
+ *
+ * @param object $logger
+ */
+ public function setLogger( $logger ) {
+ $this->logger = $logger;
+ }
+
+	/**
+	 * Relay a line of output to the logger, if one has been set.
+	 * Can it get any uglier?
+	 *
+	 * @param string $text Line without the trailing newline
+	 */
+	protected function logOutput( $text ) {
+		if ( !$this->logger ) {
+			return;
+		}
+
+		$this->logger->statusLine( "$text\n" );
+	}
+
+ /**
+ * Force the update of index mappings
+ *
+ * Only sets a flag; beginBootstrap() recreates the index when it is set.
+ * @since 2015.03
+ */
+ public function doMappingUpdate() {
+ $this->updateMapping = true;
+ }
+
+ /**
+ * Parse query string and build the search query
+ *
+ * The query string is split on whitespace. Each word is matched against a
+ * content field chosen by its form, or against 'localid' exactly.
+ *
+ * @param string $queryString
+ * @param array $opts Reads the keys 'match' and 'case'
+ * @return array Two items: the \Elastica\Query\BoolQuery and the highlight
+ *  field settings keyed by field name
+ */
+ protected function parseQueryString( $queryString, array $opts ) {
+ $fields = $highlights = [];
+ $terms = preg_split( '/\s+/', $queryString );
+ $match = $opts['match'];
+ $case = $opts['case'];
+
+ // Map each word in the query string with its corresponding field
+ foreach ( $terms as $term ) {
+ // Text before the first '*', or false when the term has no '*'.
+ $prefix = strstr( $term, '*', true );
+ if ( $prefix ) {
+ // For wildcard search
+ $fields['content.prefix_complete'][] = $prefix;
+ } elseif ( $case === '1' ) {
+ // For case sensitive search
+ $fields['content.case_sensitive'][] = $term;
+ } else {
+ $fields['content'][] = $term;
+ }
+ }
+
+ // Allow searching either by message content or message id (page name
+ // without language subpage) with exact match only.
+ $searchQuery = new \Elastica\Query\BoolQuery();
+ // Despite its name, $analyzer holds the field name used as key above.
+ foreach ( $fields as $analyzer => $words ) {
+ foreach ( $words as $word ) {
+ $boolQuery = new \Elastica\Query\BoolQuery();
+ $contentQuery = new \Elastica\Query\Match();
+ $contentQuery->setFieldQuery( $analyzer, $word );
+ $boolQuery->addShould( $contentQuery );
+ $messageQuery = new \Elastica\Query\Term();
+ $messageQuery->setTerm( 'localid', $word );
+ $boolQuery->addShould( $messageQuery );
+
+ // 'all' requires every word to match; anything else makes words optional.
+ if ( $match === 'all' ) {
+ $searchQuery->addMust( $boolQuery );
+ } else {
+ $searchQuery->addShould( $boolQuery );
+ }
+
+ // Fields for highlighting
+ $highlights[$analyzer] = [
+ 'number_of_fragments' => 0
+ ];
+
+ // Allow searching by exact message title (page name with
+ // language subpage).
+ $title = Title::newFromText( $word );
+ if ( !$title ) {
+ continue;
+ }
+ $handle = new MessageHandle( $title );
+ if ( $handle->isValid() && $handle->getCode() !== '' ) {
+ $localid = $handle->getTitleForBase()->getPrefixedText();
+ $boolQuery = new \Elastica\Query\BoolQuery();
+ $messageId = new \Elastica\Query\Term();
+ $messageId->setTerm( 'localid', $localid );
+ $boolQuery->addMust( $messageId );
+ $searchQuery->addShould( $boolQuery );
+ }
+ }
+ }
+
+ return [ $searchQuery, $highlights ];
+ }
+
+ /**
+  * Build the search object for a translation search: the parsed query,
+  * language and group aggregations, paging, optional post filters and
+  * highlighting.
+  *
+  * @param string $queryString
+  * @param array $opts Reads 'limit', 'offset', 'language' and 'group', plus
+  *  whatever parseQueryString() reads.
+  * @param array $highlight Two items: the tag placed before and the tag
+  *  placed after a highlighted fragment.
+  * @return \Elastica\Search
+  */
+ public function createSearch( $queryString, $opts, $highlight ) {
+     list( $searchQuery, $highlightFields ) = $this->parseQueryString( $queryString, $opts );
+
+     $query = new \Elastica\Query();
+     $query->setQuery( $searchQuery );
+
+     // One terms aggregation per facet, language first and then group.
+     // For groups we would like to prioritize the top level groups and not
+     // show subgroups if the top group has only few hits, but that doesn't
+     // seem to be possible.
+     foreach ( [ 'language', 'group' ] as $facet ) {
+         $aggregation = new \Elastica\Aggregation\Terms( $facet );
+         $aggregation->setField( $facet );
+         $aggregation->setSize( 500 );
+         $query->addAggregation( $aggregation );
+     }
+
+     $query->setSize( $opts['limit'] );
+     $query->setFrom( $opts['offset'] );
+
+     // BoolAnd filters are executed in sequence per document. Bool filters with
+     // multiple must clauses are executed by converting each filter into a bit
+     // field then anding them together. The latter is normally faster if either
+     // of the subfilters are reused. May not make a difference in this context.
+     $postFilter = new \Elastica\Query\BoolQuery();
+     $hasFilter = false;
+     foreach ( [ 'language', 'group' ] as $field ) {
+         $wanted = $opts[$field];
+         if ( $wanted === '' ) {
+             continue;
+         }
+
+         $termFilter = new \Elastica\Query\Term();
+         $termFilter->setTerm( $field, $wanted );
+         $postFilter->addFilter( $termFilter );
+         $hasFilter = true;
+     }
+
+     // Check that we have at least one filter to avoid invalid query errors.
+     if ( $hasFilter ) {
+         // TODO: This seems wrong, but perhaps for aggregation purposes?
+         // should make $search a must clause and use the bool query
+         // as main.
+         $query->setPostFilter( $postFilter );
+     }
+
+     list( $openTag, $closeTag ) = $highlight;
+     $query->setHighlight( [
+         // The value must be an object
+         'pre_tags' => [ $openTag ],
+         'post_tags' => [ $closeTag ],
+         'fields' => $highlightFields,
+     ] );
+
+     return $this->getType()->getIndex()->createSearch( $query );
+ }
+
+ /**
+  * Run a translation search and return the raw result set.
+  *
+  * @param string $queryString
+  * @param array $opts
+  * @param array $highlight
+  * @throws TTMServerException When the Elastica client reports an error.
+  *  The Elastica exception is attached as the previous exception.
+  * @return \Elastica\ResultSet
+  */
+ public function search( $queryString, $opts, $highlight ) {
+     $search = $this->createSearch( $queryString, $opts, $highlight );
+
+     try {
+         return $search->search();
+     } catch ( \Elastica\Exception\ExceptionInterface $e ) {
+         // Chain the original exception so that its class and stack trace
+         // are still available to whoever handles the TTMServerException.
+         throw new TTMServerException( $e->getMessage(), 0, $e );
+     }
+ }
+
+ /**
+  * Turn the aggregations of a result set into facet counts.
+  *
+  * @param \Elastica\ResultSet $resultset Result set as returned by search()
+  * @return array[] Map of aggregation name to a map of bucket key to
+  *  document count. The 'language' and 'group' entries always exist.
+  */
+ public function getFacets( $resultset ) {
+     $facets = [
+         'language' => [],
+         'group' => [],
+     ];
+
+     foreach ( $resultset->getAggregations() as $name => $aggregation ) {
+         foreach ( $aggregation['buckets'] as $bucket ) {
+             $facets[$name][$bucket['key']] = $bucket['doc_count'];
+         }
+     }
+
+     return $facets;
+ }
+
+ /**
+ * @param \Elastica\ResultSet $resultset Result set as returned by search()
+ * @return int Whatever $resultset->getTotalHits() reports
+ */
+ public function getTotalHits( $resultset ) {
+ return $resultset->getTotalHits();
+ }
+
+ /**
+  * Extract the documents of a result set as arrays.
+  *
+  * When a highlighted fragment exists, 'content' is replaced with the first
+  * fragment of the first field that has one, checked in this order: prefix
+  * field, case sensitive field, default content field.
+  *
+  * @param \Elastica\ResultSet $resultset Result set as returned by search()
+  * @return array[]
+  */
+ public function getDocuments( $resultset ) {
+     $highlightPriority = [
+         'content.prefix_complete',
+         'content.case_sensitive',
+         'content',
+     ];
+
+     $documents = [];
+     foreach ( $resultset->getResults() as $hit ) {
+         $row = $hit->getData();
+         $fragments = $hit->getHighlights();
+
+         foreach ( $highlightPriority as $field ) {
+             if ( isset( $fragments[$field][0] ) ) {
+                 $row['content'] = $fragments[$field][0];
+                 break;
+             }
+         }
+
+         $documents[] = $row;
+     }
+
+     return $documents;
+ }
+
+ /**
+  * Delete all documents matching a query.
+  *
+  * Delegates to MWElasticUtils::deleteByQuery() when that method exists.
+  * Otherwise scrolls through the matching documents and deletes them by
+  * id, one scroll page at a time.
+  * TODO: Elastica\Index::deleteByQuery() ? was removed
+  * in 2.x and returned in 5.x.
+  *
+  * @param \Elastica\Type $type the source index
+  * @param \Elastica\Query $query the query
+  * @throws \RuntimeException
+  */
+ private function deleteByQuery( \Elastica\Type $type, \Elastica\Query $query ) {
+     if ( method_exists( 'MWElasticUtils', 'deleteByQuery' ) ) {
+         try {
+             MWElasticUtils::deleteByQuery( $type->getIndex(), $query, /* $allowConflicts = */ true );
+         } catch ( \Exception $e ) {
+             $log = LoggerFactory::getInstance( 'ElasticSearchTTMServer' );
+             $log->error( 'Problem encountered during deletion.', [ 'exception' => $e ] );
+
+             throw new \RuntimeException( "Problem encountered during deletion.\n" . $e );
+         }
+
+         return;
+     }
+
+     // TODO: This code can be removed in the future as we drop support for
+     // older versions of the Elastica extension.
+     $retryAttempts = self::BULK_INDEX_RETRY_ATTEMPTS;
+     $search = new \Elastica\Search( $this->getClient() );
+     $search->setQuery( $query );
+     $search->addType( $type );
+     $search->addIndex( $type->getIndex() );
+
+     foreach ( new \Elastica\Scroll( $search, '15m' ) as $page ) {
+         $ids = [];
+         foreach ( $page as $hit ) {
+             $ids[] = $hit->getId();
+         }
+
+         // Nothing to delete on an empty page
+         if ( $ids !== [] ) {
+             MWElasticUtils::withRetry( $retryAttempts,
+                 function () use ( $ids, $type ) {
+                     $type->deleteIds( $ids );
+                 }
+             );
+         }
+     }
+ }
+
+ /**
+  * Check whether writes to the index are frozen.
+  *
+  * Uses MWElasticUtils::isFrozen() when that method exists. Otherwise looks
+  * in the index named by the 'frozen_index' configuration entry for a
+  * document whose id is either the "all indexes" marker or this index name.
+  * Any error during the check is logged as a warning and reported as
+  * "not frozen".
+  *
+  * @return bool
+  */
+ public function isFrozen() {
+     if ( method_exists( 'MWElasticUtils', 'isFrozen' ) ) {
+         try {
+             return MWElasticUtils::isFrozen( $this->getClient() );
+         } catch ( \Exception $e ) {
+             LoggerFactory::getInstance( 'ElasticSearchTTMServer' )->warning(
+                 'Problem encountered while checking the frozen index.',
+                 [ 'exception' => $e ]
+             );
+
+             return false;
+         }
+     }
+
+     // TODO: This code can be removed in the future as we drop support for
+     // older versions of the Elastica extension.
+     if ( !isset( $this->config['frozen_index'] ) ) {
+         return false;
+     }
+
+     $frozenIndex = $this->config['frozen_index'];
+     $idsQuery = new \Elastica\Query\Ids();
+     $idsQuery->setIds( [ static::ALL_INDEXES_FROZEN_NAME, $this->getIndexName() ] );
+
+     try {
+         $frozenType = $this->getClient()
+             ->getIndex( $frozenIndex )
+             ->getType( static::FROZEN_TYPE );
+         $resp = $frozenType->search( \Elastica\Query::create( $idsQuery ) );
+
+         // Frozen when at least one marker document was found
+         return $resp->count() !== 0;
+     } catch ( \Exception $e ) {
+         LoggerFactory::getInstance( 'ElasticSearchTTMServer' )->warning(
+             'Problem encountered while checking the frozen index.',
+             [ 'exception' => $e ]
+         );
+
+         return false;
+     }
+ }
+}
diff --git a/www/wiki/extensions/Translate/ttmserver/Exceptions.php b/www/wiki/extensions/Translate/ttmserver/Exceptions.php
new file mode 100644
index 00000000..97859cf0
--- /dev/null
+++ b/www/wiki/extensions/Translate/ttmserver/Exceptions.php
@@ -0,0 +1,4 @@
+<?php
+
+/**
+ * Exception type for TTMServer failures. In this extension the search()
+ * methods of ElasticSearchTTMServer and SolrTTMServer throw it when the
+ * search backend client raises an error.
+ * @ingroup TTMServer
+ */
+class TTMServerException extends MWException {
+}
diff --git a/www/wiki/extensions/Translate/ttmserver/FakeTTMServer.php b/www/wiki/extensions/Translate/ttmserver/FakeTTMServer.php
new file mode 100644
index 00000000..16715592
--- /dev/null
+++ b/www/wiki/extensions/Translate/ttmserver/FakeTTMServer.php
@@ -0,0 +1,60 @@
+<?php
+/**
+ * TTMServer - The Translate extension translation memory interface
+ *
+ * @file
+ * @author Niklas Laxström
+ * @copyright Copyright © 2012-2013, Niklas Laxström
+ * @license GPL-2.0-or-later
+ * @ingroup TTMServer
+ */
+
+/**
+ * NO-OP version of TTMServer when it is disabled.
+ * Keeps other code simpler when they can just do
+ * TTMServer::primary()->update( ... );
+ * @since 2012-01-28
+ * @ingroup TTMServer
+ */
+class FakeTTMServer implements ReadableTTMServer, WritableTTMServer {
+ /**
+  * @param string $sourceLanguage
+  * @param string $targetLanguage
+  * @param string $text
+  * @return array Always empty: there are never any suggestions.
+  */
+ public function query( $sourceLanguage, $targetLanguage, $text ) {
+     return [];
+ }
+
+ /**
+  * @param array $suggestion
+  * @return bool Always false. The statement used to lack the "return"
+  *  keyword, which made the method return null.
+  */
+ public function isLocalSuggestion( array $suggestion ) {
+     return false;
+ }
+
+ /**
+  * @param array $suggestion
+  * @return string Always the empty string.
+  */
+ public function expandLocation( array $suggestion ) {
+     return '';
+ }
+
+ // The write operations below are intentionally empty: nothing is stored.
+
+ public function update( MessageHandle $handle, $targetText ) {
+ }
+
+ public function beginBootstrap() {
+ }
+
+ public function beginBatch() {
+ }
+
+ public function batchInsertDefinitions( array $batch ) {
+ }
+
+ public function batchInsertTranslations( array $batch ) {
+ }
+
+ public function endBatch() {
+ }
+
+ public function endBootstrap() {
+ }
+
+ /**
+  * @return string[] Always empty: there is nothing to mirror.
+  */
+ public function getMirrors() {
+     return [];
+ }
+
+ /**
+  * @return bool Always false.
+  */
+ public function isFrozen() {
+     return false;
+ }
+}
diff --git a/www/wiki/extensions/Translate/ttmserver/FuzzyLikeThis.php b/www/wiki/extensions/Translate/ttmserver/FuzzyLikeThis.php
new file mode 100644
index 00000000..143b3222
--- /dev/null
+++ b/www/wiki/extensions/Translate/ttmserver/FuzzyLikeThis.php
@@ -0,0 +1,222 @@
+<?php
+/**
+ * NOTE: the following class has been copied from elastica 2.3.1 :
+ * https://github.com/ruflin/Elastica/blob/2.3.1/lib/Elastica/Query/FuzzyLikeThis.php
+ * (few modifications have been made to comply with phpcs rules used by this extension)
+ * It is intended to be used as a temporary workaround with the wmf extra
+ * elasticsearch plugin with elasticsearch 2.x.
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2014 Nicolas Ruflin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * (c.f. https://github.com/ruflin/Elastica/blob/2.3.1/LICENSE.txt)
+ *
+ * @file
+ * @license MIT
+ * @ingroup TTMServer
+ */
+
+/**
+ * Fuzzy Like This query.
+ *
+ * @author Raul Martinez, Jr <juneym@gmail.com>
+ *
+ * @link https://www.elastic.co/guide/en/elasticsearch/reference/1.7/query-dsl-flt-query.html
+ *
+ * @since 2016.05
+ * @ingroup TTMServer
+ */
+class FuzzyLikeThis extends \Elastica\Query\AbstractQuery {
+ // phpcs:disable PSR2.Classes.PropertyDeclaration.Underscore
+ /**
+ * Field names.
+ *
+ * @var array Field names
+ */
+ protected $_fields = [];
+
+ /**
+ * Like text.
+ *
+ * @var string Like text
+ */
+ protected $_likeText = '';
+
+ /**
+ * Ignore term frequency.
+ *
+ * @var bool ignore term frequency
+ */
+ protected $_ignoreTF = false;
+
+ /**
+ * Max query terms value.
+ *
+ * @var int Max query terms value
+ */
+ protected $_maxQueryTerms = 25;
+
+ /**
+ * fuzziness.
+ *
+ * @var int fuzziness
+ */
+ protected $_fuzziness = 2;
+
+ /**
+ * Prefix Length.
+ *
+ * @var int Prefix Length
+ */
+ protected $_prefixLength = 0;
+
+ /**
+ * Analyzer.
+ *
+ * @var string Analyzer
+ */
+ protected $_analyzer;
+ // phpcs:enable
+
+ /**
+ * Sets the fields of the flt query.
+ * Despite the method name, the given list replaces any fields set
+ * earlier; nothing is appended.
+ *
+ * @param array $fields Field names
+ *
+ * @return $this
+ */
+ public function addFields( array $fields ) {
+ $this->_fields = $fields;
+
+ return $this;
+ }
+
+ /**
+ * Set the "like_text" value.
+ *
+ * @param string $text
+ *
+ * @return $this
+ */
+ public function setLikeText( $text ) {
+ $text = trim( $text );
+ $this->_likeText = $text;
+
+ return $this;
+ }
+
+ /**
+ * Set the "ignore_tf" value (ignore term frequency).
+ *
+ * @param bool $ignoreTF
+ *
+ * @return $this
+ */
+ public function setIgnoreTF( $ignoreTF ) {
+ $this->_ignoreTF = (bool)$ignoreTF;
+
+ return $this;
+ }
+
+ /**
+ * Set the fuzziness. The value is cast to int; toArray() emits it as the
+ * "fuzziness" parameter and replaces non-positive values with 0.
+ *
+ * @param int $value
+ *
+ * @return $this
+ */
+ public function setFuzziness( $value ) {
+ $value = (int)$value;
+ $this->_fuzziness = $value;
+
+ return $this;
+ }
+
+ /**
+ * Set Prefix Length.
+ *
+ * @param int $value Prefix length
+ *
+ * @return $this
+ */
+ public function setPrefixLength( $value ) {
+ $this->_prefixLength = (int)$value;
+
+ return $this;
+ }
+
+ /**
+ * Set max_query_terms.
+ *
+ * @param int $value Max query terms value
+ *
+ * @return $this
+ */
+ public function setMaxQueryTerms( $value ) {
+ $this->_maxQueryTerms = (int)$value;
+
+ return $this;
+ }
+
+ /**
+ * Set analyzer.
+ *
+ * @param string $text Analyzer text
+ *
+ * @return $this
+ */
+ public function setAnalyzer( $text ) {
+ $text = trim( $text );
+ $this->_analyzer = $text;
+
+ return $this;
+ }
+
+ /**
+  * Converts fuzzy like this query to array.
+  *
+  * The 'fields' and 'analyzer' entries are only present when they have
+  * been set to a non-empty value. Parameters held by the parent class
+  * under 'fuzzy_like_this' are merged in last and win on key conflicts.
+  *
+  * @return array Query array
+  *
+  * @see \Elastica\Query\AbstractQuery::toArray()
+  */
+ public function toArray() {
+     $args = [];
+
+     if ( !empty( $this->_fields ) ) {
+         $args['fields'] = $this->_fields;
+     }
+
+     if ( !empty( $this->_analyzer ) ) {
+         $args['analyzer'] = $this->_analyzer;
+     }
+
+     // None of these keys has been set above, so the union appends them
+     // in the listed order.
+     $args += [
+         'fuzziness' => ( $this->_fuzziness > 0 ) ? $this->_fuzziness : 0,
+         'like_text' => $this->_likeText,
+         'prefix_length' => $this->_prefixLength,
+         'ignore_tf' => $this->_ignoreTF,
+         'max_query_terms' => $this->_maxQueryTerms,
+     ];
+
+     $parentData = parent::toArray();
+
+     return [ 'fuzzy_like_this' => array_merge( $args, $parentData['fuzzy_like_this'] ) ];
+ }
+}
diff --git a/www/wiki/extensions/Translate/ttmserver/Interfaces.php b/www/wiki/extensions/Translate/ttmserver/Interfaces.php
new file mode 100644
index 00000000..1f8cb20e
--- /dev/null
+++ b/www/wiki/extensions/Translate/ttmserver/Interfaces.php
@@ -0,0 +1,154 @@
+<?php
+/**
+ * TTMServer - The Translate extension translation memory interface
+ *
+ * @file
+ * @author Niklas Laxström
+ * @copyright Copyright © 2012-2013, Niklas Laxström
+ * @license GPL-2.0-or-later
+ * @ingroup TTMServer
+ */
+
+/**
+ * Interface for TTMServer that can be queried (=all of them).
+ * @ingroup TTMServer
+ * @since 2012-06-27
+ */
+interface ReadableTTMServer {
+ /**
+ * Fetches all relevant suggestions for given text.
+ *
+ * @param string $sourceLanguage language code for the provided text
+ * @param string $targetLanguage language code for the suggestions
+ * @param string $text the text for which to search suggestions
+ * @return array List: unordered suggestions, which each has fields:
+ * - source: String: the original text of the suggestion
+ * - target: String: the suggestion
+ * - context: String: title of the page where the suggestion comes from
+ * - quality: Float: the quality of suggestion, 1 is perfect match
+ */
+ public function query( $sourceLanguage, $targetLanguage, $text );
+
+ /**
+ * Determines if the suggestion returned by this TTMServer comes
+ * from this wiki or any other wiki.
+ * @param array $suggestion
+ * @return bool
+ */
+ public function isLocalSuggestion( array $suggestion );
+
+ /**
+ * Given suggestion returned by this TTMServer, constructs fully
+ * qualified URL to the location of the translation.
+ * @param array $suggestion
+ * @return string URL
+ */
+ public function expandLocation( array $suggestion );
+}
+
+/**
+ * Interface for TTMServer that can be updated.
+ * @ingroup TTMServer
+ * @since 2012-06-27
+ */
+interface WritableTTMServer {
+ /**
+ * Shovels the new translation into translation memory.
+ * Use this for single updates (=after message edit).
+ * If no text is provided, entry will be removed from the translation
+ * memory.
+ *
+ * No return value is declared here and implementations differ:
+ * SolrTTMServer returns a bool while FakeTTMServer returns nothing.
+ * Callers should not rely on the return value.
+ *
+ * @param MessageHandle $handle
+ * @param string|null $targetText Use null to only delete.
+ */
+ public function update( MessageHandle $handle, $targetText );
+
+ /**
+ * Called when starting to fill the translation memory.
+ * Set up necessary variables and remove old content
+ * from the server.
+ */
+ public function beginBootstrap();
+
+ /**
+ * Called before every batch (MessageGroup).
+ */
+ public function beginBatch();
+
+ /**
+ * Called multiple times per batch if necessary.
+ *
+ * @param array $batch
+ */
+ public function batchInsertDefinitions( array $batch );
+
+ /**
+ * Called multiple times per batch if necessary.
+ *
+ * @param array $batch
+ */
+ public function batchInsertTranslations( array $batch );
+
+ /**
+ * Called at the end of every batch (MessageGroup); the counterpart of
+ * beginBatch().
+ */
+ public function endBatch();
+
+ /**
+ * Do any cleanup, optimizing etc.
+ */
+ public function endBootstrap();
+
+ /**
+ * Get the list of services to duplicate writes to make them "mirrors"
+ * of this service.
+ *
+ * @since 2017.04
+ * @return string[]
+ */
+ public function getMirrors();
+
+ /**
+ * Check if the service is frozen, attempting to write to
+ * a frozen service may lead to errors or unexpected behaviors.
+ *
+ * @since 2017.04
+ * @return bool true if the service is frozen
+ */
+ public function isFrozen();
+}
+
+/**
+ * Interface for TTMServer that can act as backend for translation search.
+ * @ingroup TTMServer
+ * @since 2014.04
+ */
+interface SearchableTTMServer {
+ /**
+ * Performs a search in the translation database.
+ *
+ * @param string $queryString String to search for.
+ * @param array $opts Query options like language.
+ * @param array $highlight Tags for highlighting.
+ * @return mixed Result set. Its type depends on the backend; pass it
+ * unchanged to getFacets(), getTotalHits() and getDocuments().
+ */
+ public function search( $queryString, $opts, $highlight );
+
+ /**
+ * @param mixed $resultset Result set as returned by search()
+ * @return array[]
+ */
+ public function getFacets( $resultset );
+
+ /**
+ * @param mixed $resultset Result set as returned by search()
+ * @return int
+ */
+ public function getTotalHits( $resultset );
+
+ /**
+ * @param mixed $resultset Result set as returned by search()
+ * @return array[]
+ */
+ public function getDocuments( $resultset );
+}
diff --git a/www/wiki/extensions/Translate/ttmserver/RemoteTTMServer.php b/www/wiki/extensions/Translate/ttmserver/RemoteTTMServer.php
new file mode 100644
index 00000000..402ad5ac
--- /dev/null
+++ b/www/wiki/extensions/Translate/ttmserver/RemoteTTMServer.php
@@ -0,0 +1,32 @@
+<?php
+/**
+ * TTMServer - The Translate extension translation memory interface
+ *
+ * @file
+ * @author Niklas Laxström
+ * @copyright Copyright © 2012-2013, Niklas Laxström
+ * @license GPL-2.0-or-later
+ * @ingroup TTMServer
+ */
+
+/**
+ * Class for handling remote TTMServers over MediaWiki API.
+ * Currently querying is done in TranslationHelpers, and
+ * this class only handles location retrieval.
+ * @since 2012-06-27
+ * @ingroup TTMServer
+ */
+class RemoteTTMServer extends TTMServer implements ReadableTTMServer {
+ /**
+ * Not implemented: always returns an empty list of suggestions.
+ * @param string $sourceLanguage
+ * @param string $targetLanguage
+ * @param string $text
+ * @return array Always empty
+ */
+ public function query( $sourceLanguage, $targetLanguage, $text ) {
+ // @todo Implement some day perhaps?
+ return [];
+ }
+
+ /**
+ * @param array $suggestion
+ * @return bool Always false
+ */
+ public function isLocalSuggestion( array $suggestion ) {
+ return false;
+ }
+
+ /**
+ * @param array $suggestion Must contain a 'location' entry
+ * @return string The 'location' entry of the suggestion, unchanged
+ */
+ public function expandLocation( array $suggestion ) {
+ return $suggestion['location'];
+ }
+}
diff --git a/www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php b/www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php
new file mode 100644
index 00000000..375d544f
--- /dev/null
+++ b/www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php
@@ -0,0 +1,446 @@
+<?php
+/**
+ * TTMServer - The Translate extension translation memory interface
+ *
+ * @file
+ * @author Niklas Laxström
+ * @copyright Copyright © 2012-2013, Niklas Laxström
+ * @license GPL-2.0-or-later
+ * @ingroup TTMServer
+ */
+
+/**
+ * TTMServer backed based on Solr instance. Depends on Solarium.
+ * @since 2012-06-27
+ * @ingroup TTMServer
+ * @deprecated 1.27. Will be removed in 1.29.
+ */
+class SolrTTMServer
+ extends TTMServer
+ implements ReadableTTMServer, SearchableTTMServer, WritableTTMServer
+{
+ /**
+ * In case auto-commit is not enabled, or even if it is, tell solr to
+ * commit before this time has passed, in milliseconds.
+ */
+ const COMMIT_WITHIN = 5000;
+
+ protected $client;
+
+ /**
+ * Reference to the maintenance script to relay logging output.
+ */
+ protected $logger;
+
+ /**
+  * Emits a deprecation notice, stores the configuration and creates the
+  * Solarium client.
+  *
+  * @param array $config The optional 'config' entry is passed on to the
+  *  Solarium client constructor.
+  */
+ public function __construct( $config ) {
+     wfDeprecated( __METHOD__, '1.24' );
+
+     parent::__construct( $config );
+
+     $this->client = isset( $config['config'] )
+         ? new Solarium_Client( $config['config'] )
+         : new Solarium_Client();
+ }
+
+ /**
+ * @param array $suggestion Must contain a 'wiki' entry
+ * @return bool True when the 'wiki' entry is identical to wfWikiID()
+ */
+ public function isLocalSuggestion( array $suggestion ) {
+ return $suggestion['wiki'] === wfWikiID();
+ }
+
+ /**
+ * @param array $suggestion Must contain a 'uri' entry
+ * @return string The 'uri' entry of the suggestion, unchanged
+ */
+ public function expandLocation( array $suggestion ) {
+ return $suggestion['uri'];
+ }
+
+ /**
+ * Fetch suggestions by delegating to doQuery().
+ * @param string $sourceLanguage
+ * @param string $targetLanguage
+ * @param string $text
+ * @return array
+ * @throws TranslationHelperException Wraps any Solarium_Exception; the
+ * original exception is only included as text in the message.
+ */
+ public function query( $sourceLanguage, $targetLanguage, $text ) {
+ try {
+ return $this->doQuery( $sourceLanguage, $targetLanguage, $text );
+ } catch ( Solarium_Exception $e ) {
+ throw new TranslationHelperException( 'Solarium exception: ' . $e );
+ }
+ }
+
+ /**
+  * Find translation suggestions for a text with two Solr queries.
+  *
+  * @see ReadableTTMServer::query
+  * @param string $sourceLanguage
+  * @param string $targetLanguage
+  * @param string $text
+  * @return array[] Suggestions sorted by descending quality. Array keys
+  *  are preserved by the sort, so they are not necessarily in order.
+  */
+ protected function doQuery( $sourceLanguage, $targetLanguage, $text ) {
+     /* Two query system:
+      * 1) Find all strings in source language that match text
+      * 2) Do another query for translations for those strings
+      */
+     // For now impose a length limit on query string to avoid doing
+     // very slow queries. Magic number.
+     if ( strlen( $text ) > 789 ) {
+         return [];
+     }
+
+     $query = $this->client->createSelect();
+     $query->setFields( [ 'globalid', 'content', 'score' ] );
+
+     /* The interface usually displays three best candidates. These might
+      * come from more than three matches, if the translation is the same.
+      * This might not find all suggestions, if the top N best matching
+      * source texts don't have translations, but worse matches do. We
+      * could loop with start parameter to fetch more until we have enough
+      * suggestions or the quality drops below the cutoff point. */
+     $query->setRows( 25 );
+
+     /* Our string can contain all kind of nasty characters, so we need
+      * escape them with great pain. */
+     $helper = $query->getHelper();
+     $dist = $helper->escapePhrase( $text );
+     // "edit" could also be ngram of other algorithm
+     $dist = "strdist($dist,content,edit)";
+     /* Note how we need to escape twice here, first the string for strdist
+      * and then the strdist call itself for the query. And of course every-
+      * thing will be URL encoded once sent over the line. */
+     $query->setQuery( '_val_:%P1%', [ $dist ] );
+
+     /* Filter queries are supposed to be efficient as they are separately
+      * cached, but I haven't done any benchmarks. */
+     $query->createFilterQuery( 'lang' )
+         ->setQuery( 'language:%P1%', [ $sourceLanguage ] );
+
+     $resultset = $this->client->select( $query );
+
+     /* This loop is doing two unrelated things:
+      * 1) Collect the message contents and scores so that they can
+      *    be accessed later for the translations we found.
+      * 2) Build the query string for the query that fetches the
+      *    translations.
+      * This code is a bit uglier than I'd like it to be, since
+      * there is no field that globally identifies a message (message
+      * definition and translations). */
+     $contents = $scores = [];
+     $queryString = '';
+     foreach ( $resultset as $doc ) {
+         $sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid );
+         $contents[$sourceId] = $doc->content;
+         $scores[$sourceId] = $doc->score;
+
+         $globalid = $helper->escapePhrase( "$sourceId/$targetLanguage" );
+         $queryString .= "globalid:$globalid ";
+     }
+
+     // No matching source text means there is nothing to look up. Return
+     // early instead of sending an empty query string to the server.
+     if ( $queryString === '' ) {
+         return [];
+     }
+
+     // Second query to fetch available translations
+     $fetchQuery = $this->client->createSelect();
+     $fetchQuery->setFields( [ 'wiki', 'uri', 'content', 'messageid', 'globalid' ] );
+     // These come in random order, so have to fetch all and sort
+     $fetchQuery->setRows( 25 );
+     $fetchQuery->setQuery( $queryString );
+     // With AND we would not find anything, obviously.
+     $fetchQuery->setQueryDefaultOperator( Solarium_Query_Select::QUERY_OPERATOR_OR );
+
+     $translations = $this->client->select( $fetchQuery );
+
+     $suggestions = [];
+     foreach ( $translations as $doc ) {
+         /* Construct the matching source id */
+         $sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid );
+
+         // Skip documents that do not belong to any of the source texts
+         // found above rather than reading a missing array index.
+         if ( !isset( $scores[$sourceId] ) ) {
+             continue;
+         }
+
+         /* Unfortunately we cannot do this on the search server,
+          * because score is not a real field and thus cannot be
+          * used in a filter query. */
+         $quality = $scores[$sourceId];
+         if ( $quality < $this->config['cutoff'] ) {
+             continue;
+         }
+
+         $suggestions[] = [
+             'source' => $contents[$sourceId],
+             'target' => $doc->content,
+             'context' => $doc->messageid,
+             'quality' => $quality,
+             'wiki' => $doc->wiki,
+             'location' => $doc->messageid . '/' . $targetLanguage,
+             'uri' => $doc->uri,
+         ];
+     }
+
+     /* Like mentioned above, we get results in random order. Sort them
+      * now to have best matches first as expected by callers. */
+     uasort( $suggestions, function ( $a, $b ) {
+         if ( $a['quality'] === $b['quality'] ) {
+             return 0;
+         }
+
+         return ( $a['quality'] < $b['quality'] ) ? 1 : -1;
+     } );
+
+     return $suggestions;
+ }
+
+ /* Write functions */
+
+ /**
+  * Replace the translation memory entry of a single message.
+  *
+  * Unless the handle is the source language version of a valid message,
+  * existing entries for the message and language are deleted first. A new
+  * document is then added when $targetText is not null.
+  *
+  * @param MessageHandle $handle
+  * @param string|null $targetText Use null to only delete.
+  * @return bool False when the handle has no language code or the write
+  *  to Solr failed, true otherwise.
+  */
+ public function update( MessageHandle $handle, $targetText ) {
+     if ( $handle->getCode() === '' ) {
+         return false;
+     }
+
+     /* There are various different cases here:
+      * [new or updated] [fuzzy|non-fuzzy] [translation|definition]
+      * 1) We don't distinguish between new or updated here.
+      * 2) Delete old translation, but not definition
+      * 3) Insert new translation or definition, if non-fuzzy
+      * The definition should never be fuzzied anyway.
+      *
+      * These only apply to known messages.
+      */
+
+     $update = $this->client->createUpdate();
+     $title = $handle->getTitle();
+
+     $doDelete = true;
+     $sourceLanguage = '';
+     if ( $handle->isValid() ) {
+         $sourceLanguage = $handle->getGroup()->getSourceLanguage();
+         if ( $handle->getCode() === $sourceLanguage ) {
+             $doDelete = false;
+         }
+     }
+
+     if ( $doDelete ) {
+         $base = Title::makeTitle( $title->getNamespace(), $handle->getKey() );
+         $conds = [
+             'wiki' => wfWikiID(),
+             'language' => $handle->getCode(),
+             'messageid' => $base->getPrefixedText(),
+         ];
+         // Build the clauses in a new array. The previous by-reference
+         // loop left a dangling reference to the last element behind.
+         $helper = $update->getHelper();
+         $clauses = [];
+         foreach ( $conds as $field => $value ) {
+             $clauses[] = "$field:" . $helper->escapePhrase( $value );
+         }
+         $update->addDeleteQuery( implode( ' AND ', $clauses ) );
+     }
+
+     if ( $targetText !== null ) {
+         if ( $handle->isValid() ) {
+             // Of the message definition page
+             $targetTitle = $handle->getTitle();
+             $sourceTitle = Title::makeTitle(
+                 $targetTitle->getNamespace(),
+                 $handle->getKey() . '/' . $sourceLanguage
+             );
+             /* Note: in some cases the source page might not exist, in this case
+              * we use 0 as message version identifier, to differentiate them from
+              * orphan messages */
+             $revId = (int)$sourceTitle->getLatestRevID();
+         } else {
+             $revId = 'orphan';
+         }
+
+         $doc = $this->createDocument( $handle, $targetText, $revId );
+         // Add document and commit within X seconds.
+         $update->addDocument( $doc, null, self::COMMIT_WITHIN );
+     }
+
+     try {
+         $this->client->update( $update );
+     } catch ( Solarium_Exception $e ) {
+         // Include the reason so that the failure can be diagnosed
+         error_log( 'SolrTTMServer update-write failed: ' . $e->getMessage() );
+
+         return false;
+     }
+
+     return true;
+ }
+
+ /**
+ * Build the Solr document for one message in one language.
+ * @see schema.xml
+ * @param MessageHandle $handle
+ * @param string $text
+ * @param int|string $revId Revision id of the source page; update()
+ * passes the string 'orphan' for handles that are not valid.
+ * @return Solarium_Document_ReadWrite
+ */
+ protected function createDocument( MessageHandle $handle, $text, $revId ) {
+ $language = $handle->getCode();
+ $translationTitle = $handle->getTitle();
+
+ $title = Title::makeTitle( $handle->getTitle()->getNamespace(), $handle->getKey() );
+ $wiki = wfWikiID();
+ $messageid = $title->getPrefixedText();
+ // doQuery() strips the final "/language" part again to get the source id
+ $globalid = "$wiki-$messageid-$revId/$language";
+
+ $doc = new Solarium_Document_ReadWrite();
+ $doc->wiki = $wiki;
+ $doc->uri = $translationTitle->getCanonicalURL();
+ $doc->messageid = $messageid;
+ $doc->globalid = $globalid;
+
+ $doc->language = $language;
+ $doc->content = $text;
+ $doc->setField( 'group', $handle->getGroupIds() );
+
+ return $doc;
+ }
+
+ /**
+ * Delete every document whose 'wiki' field matches the current wiki ID
+ * and commit the deletion.
+ */
+ public function beginBootstrap() {
+ $update = $this->client->createUpdate();
+ $query = 'wiki:' . $update->getHelper()->escapePhrase( wfWikiID() );
+ $update->addDeleteQuery( $query );
+ $update->addCommit();
+ $this->client->update( $update );
+ }
+
+ /**
+ * Nothing needs to be prepared before a batch for this backend.
+ */
+ public function beginBatch() {
+ // Intentionally empty
+ }
+
+ /**
+ * Insert message definitions. Apart from the LinkBatch step this is the
+ * same as batchInsertTranslations().
+ * @param array $batch Each item is read as [ handle, source language, text ]
+ */
+ public function batchInsertDefinitions( array $batch ) {
+ // NOTE(review): presumably preloads page data for all titles in one go so
+ // that the per-item getLatestRevID() calls later are cheap -- confirm.
+ $lb = new LinkBatch();
+ foreach ( $batch as $data ) {
+ $lb->addObj( $data[0]->getTitle() );
+ }
+ $lb->execute();
+
+ $this->batchInsertTranslations( $batch );
+ }
+
+ /**
+  * Add one document per batch item and send them in a single update.
+  *
+  * The update is attempted up to five times. After a failed attempt the
+  * failure is logged and the next attempt starts 10 seconds later; the
+  * exception of the fifth failure is rethrown.
+  *
+  * @param array $batch Each item is [ MessageHandle, source language, text ]
+  * @throws Solarium_Client_HttpException
+  */
+ public function batchInsertTranslations( array $batch ) {
+     $update = $this->client->createUpdate();
+     foreach ( $batch as list( $handle, $sourceLanguage, $text ) ) {
+         $revId = $handle->getTitleForLanguage( $sourceLanguage )->getLatestRevID();
+         // Add document and commit within X seconds.
+         $update->addDocument(
+             $this->createDocument( $handle, $text, $revId ),
+             null,
+             self::COMMIT_WITHIN
+         );
+     }
+
+     for ( $attemptsLeft = 4; $attemptsLeft >= 0; $attemptsLeft-- ) {
+         try {
+             $this->client->update( $update );
+             break;
+         } catch ( Solarium_Client_HttpException $e ) {
+             if ( $attemptsLeft === 0 ) {
+                 throw $e;
+             }
+
+             $c = get_class( $e );
+             $msg = $e->getMessage();
+             $this->logOutput( "Batch failed ($c: $msg), trying again in 10 seconds" );
+             sleep( 10 );
+         }
+     }
+ }
+
+ /**
+ * Sends an update request to which no commands have been added.
+ */
+ public function endBatch() {
+ $update = $this->client->createUpdate();
+ $this->client->update( $update );
+ }
+
+ /**
+ * Commit pending changes and ask the server to optimize the index.
+ */
+ public function endBootstrap() {
+ $update = $this->client->createUpdate();
+ $update->addCommit();
+ $update->addOptimize();
+ $this->client->update( $update );
+ }
+
+ /**
+ * @return Solarium_Client The client created in the constructor
+ */
+ public function getSolarium() {
+ return $this->client;
+ }
+
+ /**
+ * Set the object used to relay progress output.
+ * @param object $logger Must provide a statusLine( string ) method; logOutput() calls it.
+ */
+ public function setLogger( $logger ) {
+ $this->logger = $logger;
+ }
+
+ /**
+  * Relay one line of progress output to the logger, if one has been set.
+  * A newline is appended to the text before it is handed over.
+  * @param string $text
+  */
+ protected function logOutput( $text ) {
+     if ( !$this->logger ) {
+         return;
+     }
+
+     $this->logger->statusLine( $text . "\n" );
+ }
+
+ /**
+  * Run a translation search with highlighting, optional language and group
+  * filters, and language and group facets.
+  *
+  * @param string $queryString
+  * @param array $opts Reads 'limit', 'offset', 'language' and 'group'.
+  * @param array $highlight Two items: the text placed before and the text
+  *  placed after a highlighted fragment.
+  * @return mixed The result of the Solarium client's select() call; pass
+  *  it to getFacets(), getTotalHits() and getDocuments().
+  * @throws TTMServerException
+  */
+ public function search( $queryString, $opts, $highlight ) {
+     $client = $this->getSolarium();
+
+     $query = $client->createSelect();
+     $query->getDisMax()->setQueryParser( 'edismax' );
+     $query->setQuery( $queryString );
+     $query->setRows( $opts['limit'] );
+     $query->setStart( $opts['offset'] );
+
+     list( $openTag, $closeTag ) = $highlight;
+     $highlighting = $query->getHighlighting();
+     $highlighting->setFields( 'text' );
+     $highlighting->setSimplePrefix( $openTag );
+     $highlighting->setSimplePostfix( $closeTag );
+     $highlighting->setMaxAnalyzedChars( '5000' );
+     $highlighting->setFragSize( '5000' );
+     $highlighting->setSnippets( 1 );
+
+     // Filter queries are tagged so that the facets below can exclude them
+     foreach ( [ 'language', 'group' ] as $field ) {
+         $wanted = $opts[$field];
+         if ( $wanted === '' ) {
+             continue;
+         }
+
+         $query->createFilterQuery( $field . 'Filter' )
+             ->setQuery( $field . ':%P1%', [ $wanted ] )
+             ->addTag( 'filter' );
+     }
+
+     $facetSet = $query->getFacetSet();
+
+     $languageFacet = $facetSet->createFacetField( 'language' );
+     $languageFacet->setField( 'language' );
+     $languageFacet->setMinCount( 1 );
+     $languageFacet->addExclude( 'filter' );
+
+     $groupFacet = $facetSet->createFacetField( 'group' );
+     $groupFacet->setField( 'group' );
+     $groupFacet->setMinCount( 1 );
+     $groupFacet->setMissing( true );
+     $groupFacet->addExclude( 'filter' );
+
+     try {
+         return $client->select( $query );
+     } catch ( Solarium_Client_HttpException $e ) {
+         throw new TTMServerException( $e->getMessage() );
+     }
+ }
+
+ /**
+ * @param mixed $resultset Result set as returned by search()
+ * @return array[] The 'language' and 'group' facets of the result set,
+ * each converted to an array
+ */
+ public function getFacets( $resultset ) {
+ return [
+ 'language' => iterator_to_array( $resultset->getFacetSet()->getFacet( 'language' ) ),
+ 'group' => iterator_to_array( $resultset->getFacetSet()->getFacet( 'group' ) ),
+ ];
+ }
+
+ /**
+ * @param mixed $resultset Result set as returned by search()
+ * @return int Whatever $resultset->getNumFound() reports
+ */
+ public function getTotalHits( $resultset ) {
+ return $resultset->getNumFound();
+ }
+
+ /**
+  * Extract the documents of a result set as arrays.
+  *
+  * Each row gets a 'localid' entry copied from 'messageid', and 'content'
+  * is set to the first highlighted fragment of the 'text' field, or to the
+  * document's text when nothing was highlighted.
+  *
+  * @param mixed $resultset Result set as returned by search()
+  * @return array[]
+  */
+ public function getDocuments( $resultset ) {
+     $highlighting = $resultset->getHighlighting();
+
+     $rows = [];
+     foreach ( $resultset as $document ) {
+         $row = iterator_to_array( $document );
+         // Compatibility mapping
+         $row['localid'] = $row['messageid'];
+
+         $fragments = $highlighting->getResult( $document->globalid )->getField( 'text' );
+         $row['content'] = ( $fragments === [] ) ? $document->text : $fragments[0];
+
+         $rows[] = $row;
+     }
+
+     return $rows;
+ }
+}
diff --git a/www/wiki/extensions/Translate/ttmserver/TTMServer.php b/www/wiki/extensions/Translate/ttmserver/TTMServer.php
new file mode 100644
index 00000000..2a7f0900
--- /dev/null
+++ b/www/wiki/extensions/Translate/ttmserver/TTMServer.php
@@ -0,0 +1,207 @@
+<?php
+/**
+ * TTMServer - The Translate extension translation memory interface
+ *
+ * @file
+ * @author Niklas Laxström
+ * @license GPL-2.0-or-later
+ * @defgroup TTMServer The Translate extension translation memory interface
+ */
+
+/**
+ * Some general static methods for instantiating TTMServer and helpers.
+ *
+ * Instances keep the service configuration array they were created with.
+ *
+ * @since 2012-01-28
+ * Rewritten in 2012-06-27.
+ * @ingroup TTMServer
+ */
+class TTMServer {
+    /** @var array Service configuration given to the constructor. */
+    protected $config;
+
+    /**
+     * @param array $config Service configuration, stored unchanged.
+     */
+    protected function __construct( array $config ) {
+        $this->config = $config;
+    }
+
+    /**
+     * Creates a server instance from a service configuration.
+     *
+     * If $config['class'] is set, that class is instantiated. Otherwise
+     * $config['type'] selects one of the built-in implementations.
+     *
+     * @param array $config
+     * @return TTMServer|null Null when 'type' is set but not recognized.
+     * @throws MWException When neither 'class' nor 'type' is set.
+     */
+    public static function factory( array $config ) {
+        if ( isset( $config['class'] ) ) {
+            $class = $config['class'];
+
+            return new $class( $config );
+        } elseif ( isset( $config['type'] ) ) {
+            $type = $config['type'];
+            switch ( $type ) {
+                case 'ttmserver':
+                    return new DatabaseTTMServer( $config );
+                case 'remote-ttmserver':
+                    return new RemoteTTMServer( $config );
+                default:
+                    return null;
+            }
+        }
+
+        throw new MWException( 'TTMServer with no type' );
+    }
+
+    /**
+     * Returns the primary server instance, useful for chaining.
+     * Primary instance is defined by $wgTranslateTranslationDefaultService
+     * which is a key to $wgTranslateTranslationServices.
+     *
+     * A FakeTTMServer is returned when the default service is not configured
+     * or is not a WritableTTMServer.
+     *
+     * @return WritableTTMServer
+     */
+    public static function primary() {
+        global $wgTranslateTranslationServices,
+            $wgTranslateTranslationDefaultService;
+        if ( isset( $wgTranslateTranslationServices[$wgTranslateTranslationDefaultService] ) ) {
+            $obj = self::factory( $wgTranslateTranslationServices[$wgTranslateTranslationDefaultService] );
+            if ( $obj instanceof WritableTTMServer ) {
+                return $obj;
+            }
+        }
+
+        return new FakeTTMServer();
+    }
+
+    /**
+     * Sorts suggestions by their 'quality' value, best first.
+     *
+     * @param array[] $suggestions Each entry must have a 'quality' key.
+     * @return array[] The sorted suggestions, reindexed from 0.
+     */
+    public static function sortSuggestions( array $suggestions ) {
+        usort( $suggestions, [ __CLASS__, 'qualitySort' ] );
+
+        return $suggestions;
+    }
+
+    /**
+     * usort() callback ordering by 'quality' in descending order.
+     *
+     * @param array $a
+     * @param array $b
+     * @return int
+     */
+    protected static function qualitySort( $a, $b ) {
+        list( $c, $d ) = [ $a['quality'], $b['quality'] ];
+        if ( $c === $d ) {
+            return 0;
+        }
+
+        // Descending sort
+        return ( $c > $d ) ? -1 : 1;
+    }
+
+    /**
+     * PHP implementation of Levenshtein edit distance algorithm.
+     * Uses the native PHP implementation when possible for speed.
+     * The native levenshtein is limited to 255 bytes.
+     *
+     * The native function is only used when both strings are single-byte
+     * (byte length equals the given character length) and at most 255 bytes.
+     *
+     * @param string $str1
+     * @param string $str2
+     * @param int $length1 Length of $str1 in characters.
+     * @param int $length2 Length of $str2 in characters.
+     * @return int
+     */
+    public static function levenshtein( $str1, $str2, $length1, $length2 ) {
+        if ( $length1 === 0 ) {
+            return $length2;
+        }
+        if ( $length2 === 0 ) {
+            return $length1;
+        }
+        if ( $str1 === $str2 ) {
+            return 0;
+        }
+
+        $bytelength1 = strlen( $str1 );
+        $bytelength2 = strlen( $str2 );
+        if ( $bytelength1 === $length1 && $bytelength1 <= 255
+            && $bytelength2 === $length2 && $bytelength2 <= 255
+        ) {
+            return levenshtein( $str1, $str2 );
+        }
+
+        // The characters of $str2 are the same for every row of the matrix,
+        // so extract them once instead of once per character of $str1.
+        $chars2 = [];
+        for ( $j = 0; $j < $length2; $j++ ) {
+            $chars2[] = mb_substr( $str2, $j, 1 );
+        }
+
+        // Only the previous row of the distance matrix is needed at a time.
+        $prevRow = range( 0, $length2 );
+        for ( $i = 0; $i < $length1; $i++ ) {
+            $currentRow = [];
+            $currentRow[0] = $i + 1;
+            $c1 = mb_substr( $str1, $i, 1 );
+            foreach ( $chars2 as $j => $c2 ) {
+                $insertions = $prevRow[$j + 1] + 1;
+                $deletions = $currentRow[$j] + 1;
+                $substitutions = $prevRow[$j] + ( ( $c1 !== $c2 ) ? 1 : 0 );
+                $currentRow[] = min( $insertions, $deletions, $substitutions );
+            }
+            $prevRow = $currentRow;
+        }
+
+        return $prevRow[$length2];
+    }
+
+    /**
+     * Hook: ArticleDeleteComplete
+     * Queues a 'delete' update job for the deleted page.
+     * @param WikiPage $wikipage
+     */
+    public static function onDelete( WikiPage $wikipage ) {
+        $handle = new MessageHandle( $wikipage->getTitle() );
+        $job = TTMServerMessageUpdateJob::newJob( $handle, 'delete' );
+        JobQueueGroup::singleton()->push( $job );
+    }
+
+    /**
+     * Called from TranslateEditAddons::onSave
+     * Queues a 'refresh' update job for the changed message.
+     * @param MessageHandle $handle
+     */
+    public static function onChange( MessageHandle $handle ) {
+        $job = TTMServerMessageUpdateJob::newJob( $handle, 'refresh' );
+        JobQueueGroup::singleton()->push( $job );
+    }
+
+    /**
+     * Queues a 'rebuild' update job unless $old is an empty array.
+     *
+     * @param MessageHandle $handle
+     * @param array $old
+     */
+    public static function onGroupChange( MessageHandle $handle, $old ) {
+        if ( $old === [] ) {
+            // Don't bother for newly added messages
+            return;
+        }
+
+        $job = TTMServerMessageUpdateJob::newJob( $handle, 'rebuild' );
+        JobQueueGroup::singleton()->push( $job );
+    }
+
+    /**
+     * Returns the names of the services listed in the 'mirrors' setting of
+     * this server's configuration, without duplicates.
+     *
+     * @return string[] Empty when no 'mirrors' setting exists.
+     * @throws TTMServerException When an entry is not a string or does not
+     *  name a service in $wgTranslateTranslationServices.
+     */
+    public function getMirrors() {
+        global $wgTranslateTranslationServices;
+        if ( isset( $this->config['mirrors'] ) ) {
+            $mirrors = [];
+            foreach ( $this->config['mirrors'] as $name ) {
+                if ( !is_string( $name ) ) {
+                    throw new TTMServerException( "Invalid configuration set in " .
+                        "mirrors, expected an array of strings" );
+                }
+                if ( !isset( $wgTranslateTranslationServices[$name] ) ) {
+                    throw new TTMServerException( "Invalid configuration in " .
+                        "mirrors, unknown service $name" );
+                }
+                // Keyed by name so that repeated entries collapse into one.
+                $mirrors[$name] = true;
+            }
+            return array_keys( $mirrors );
+        }
+        return [];
+    }
+
+    /**
+     * This base implementation always reports the server as not frozen.
+     *
+     * @return bool
+     */
+    public function isFrozen() {
+        return false;
+    }
+}
diff --git a/www/wiki/extensions/Translate/ttmserver/TTMServerMessageUpdateJob.php b/www/wiki/extensions/Translate/ttmserver/TTMServerMessageUpdateJob.php
new file mode 100644
index 00000000..7a6a91d7
--- /dev/null
+++ b/www/wiki/extensions/Translate/ttmserver/TTMServerMessageUpdateJob.php
@@ -0,0 +1,347 @@
+<?php
+/**
+ * Contains class with job for updating translation memory.
+ *
+ * @file
+ * @author Niklas Laxström
+ * @license GPL-2.0-or-later
+ */
+
+use MediaWiki\Logger\LoggerFactory;
+
+/**
+ * Job for updating translation memory.
+ *
+ * job params:
+ * - command: the command to run, defaults to 'rebuild'
+ * - service: the service to write to, if set to null the job will write
+ * to the default (primary) service and its mirrors.
+ * - errorCount: number of errors encountered while trying to perform the write
+ * on this service
+ * - retryCount: number of times the job was requeued because the service
+ * reported itself as frozen
+ * - createdAt: Unix timestamp of the creation of the original job
+ *
+ * This job handles retries itself and returns false in allowRetries to disable
+ * JobQueue's internal retry service.
+ *
+ * If mirroring is activated on the primary service then the first job
+ * will try to write to all services, it will resend a new job to
+ * every single service that failed and will increment errorCount.
+ * When too many errors occur on single service the job is dropped.
+ *
+ * @ingroup JobQueue
+ */
+class TTMServerMessageUpdateJob extends Job {
+    /**
+     * Number of *retries* allowed, 4 means we attempt
+     * to run the job 5 times (1 initial attempt + 4 retries).
+     */
+    const MAX_ERROR_RETRY = 4;
+
+    /**
+     * Constant used by backoffDelay().
+     * With 7 the cumulative delay between the first and last attempt is
+     * between 8 and 33 minutes.
+     */
+    const WRITE_BACKOFF_EXPONENT = 7;
+
+    /**
+     * The maximum amount of time jobs delayed due to frozen services can remain
+     * in the job queue.
+     */
+    const DROP_DELAYED_JOBS_AFTER = 86400; // 60 * 60 * 24 * 1;
+
+    /**
+     * @param MessageHandle $handle
+     * @param string $command One of 'delete', 'rebuild' or 'refresh'.
+     * @return self
+     */
+    public static function newJob( MessageHandle $handle, $command ) {
+        $job = new self( $handle->getTitle(), [ 'command' => $command ] );
+
+        return $job;
+    }
+
+    /**
+     * @param Title $title
+     * @param array $params Job params; missing keys get their defaults.
+     */
+    public function __construct( $title, $params = [] ) {
+        parent::__construct(
+            __CLASS__,
+            $title,
+            // Array union: values given in $params win over these defaults.
+            $params + [
+                'command' => 'rebuild',
+                'service' => null,
+                'errorCount' => 0,
+                'createdAt' => time(),
+                'retryCount' => 0,
+            ]
+        );
+    }
+
+    /**
+     * Fetch all the translations and update them.
+     *
+     * When no service is set in the params, the default service and its
+     * mirrors are written to.
+     *
+     * @return bool Always true; failures are handled by requeueing.
+     */
+    public function run() {
+        global $wgTranslateTranslationServices,
+            $wgTranslateTranslationDefaultService;
+
+        $service = $this->params['service'];
+        $writeToMirrors = false;
+
+        if ( $service === null ) {
+            $service = $wgTranslateTranslationDefaultService;
+            $writeToMirrors = true;
+        }
+
+        if ( !isset( $wgTranslateTranslationServices[$service] ) ) {
+            LoggerFactory::getInstance( 'TTMServerUpdates' )->warning(
+                'Received update job for a an unknown service {service}.',
+                [ 'service' => $service ]
+            );
+            return true;
+        }
+
+        $services = [ $service ];
+        if ( $writeToMirrors ) {
+            $config = $wgTranslateTranslationServices[$service];
+            $server = TTMServer::factory( $config );
+            // factory() returns null for an unrecognized 'type'. In that case
+            // there are no mirrors to look up; runCommandWithRetry() logs the
+            // misconfiguration of the service itself.
+            if ( $server !== null ) {
+                $services = array_unique(
+                    array_merge( $services, $server->getMirrors() )
+                );
+            }
+        }
+
+        foreach ( $services as $service ) {
+            $this->runCommandWithRetry( $service );
+        }
+        return true;
+    }
+
+    /**
+     * @inheritDoc
+     */
+    public function allowRetries() {
+        return false;
+    }
+
+    /**
+     * Run the update on the specified service name.
+     *
+     * Unknown or non-writable services are logged and skipped. A frozen
+     * service or an exception during the update causes a requeue.
+     *
+     * @param string $serviceName the service name
+     */
+    private function runCommandWithRetry( $serviceName ) {
+        global $wgTranslateTranslationServices;
+
+        if ( !isset( $wgTranslateTranslationServices[$serviceName] ) ) {
+            LoggerFactory::getInstance( 'TTMServerUpdates' )->warning(
+                'Cannot write to {service}: service is unknown.',
+                [ 'service' => $serviceName ]
+            );
+            return;
+        }
+        $ttmserver = TTMServer::factory( $wgTranslateTranslationServices[$serviceName] );
+
+        if ( $serviceName === null || !( $ttmserver instanceof WritableTTMServer ) ) {
+            LoggerFactory::getInstance( 'TTMServerUpdates' )->warning(
+                'Received update job for a service that does not implement ' .
+                'WritableTTMServer, please check config for {service}.',
+                [ 'service' => $serviceName ]
+            );
+            return;
+        }
+
+        try {
+            if ( $ttmserver->isFrozen() ) {
+                $this->requeueRetry( $serviceName );
+            } else {
+                $this->runCommand( $ttmserver );
+            }
+        } catch ( \Exception $e ) {
+            $this->requeueError( $serviceName, $e );
+        }
+    }
+
+    /**
+     * Logs the error and requeues the job for the failing service with a
+     * delay, or drops it once MAX_ERROR_RETRY errors have been recorded.
+     *
+     * @param string $serviceName the service in error
+     * @param Exception $e the error
+     */
+    private function requeueError( $serviceName, $e ) {
+        LoggerFactory::getInstance( 'TTMServerUpdates' )->warning(
+            'Exception thrown while running {command} on ' .
+            'service {service}: {errorMessage}',
+            [
+                'command' => $this->params['command'],
+                'service' => $serviceName,
+                'errorMessage' => $e->getMessage(),
+                'exception' => $e,
+            ]
+        );
+        if ( $this->params['errorCount'] >= self::MAX_ERROR_RETRY ) {
+            LoggerFactory::getInstance( 'TTMServerUpdates' )->warning(
+                'Dropping failing job {command} for service {service} ' .
+                'after repeated failure',
+                [
+                    'command' => $this->params['command'],
+                    'service' => $serviceName,
+                ]
+            );
+            return;
+        }
+
+        $delay = self::backoffDelay( $this->params['errorCount'] );
+        $job = clone $this;
+        $job->params['errorCount']++;
+        // Pin the new job to the failing service so that it does not write
+        // to the other services again.
+        $job->params['service'] = $serviceName;
+        $job->setDelay( $delay );
+        LoggerFactory::getInstance( 'TTMServerUpdates' )->info(
+            'Update job reported failure on service {service}. ' .
+            'Requeueing job with delay of {delay}.',
+            [
+                'service' => $serviceName,
+                'delay' => $delay
+            ]
+        );
+        $this->resend( $job );
+    }
+
+    /**
+     * Re-queue job that is frozen, or drop the job if it has
+     * been frozen for too long.
+     *
+     * @param string $serviceName
+     */
+    private function requeueRetry( $serviceName ) {
+        $diff = time() - $this->params['createdAt'];
+        $dropTimeout = self::DROP_DELAYED_JOBS_AFTER;
+        if ( $diff > $dropTimeout ) {
+            LoggerFactory::getInstance( 'TTMServerUpdates' )->warning(
+                'Dropping delayed job {command} for service {service} ' .
+                'after waiting {diff}s',
+                [
+                    'command' => $this->params['command'],
+                    'service' => $serviceName,
+                    'diff' => $diff,
+                ]
+            );
+        } else {
+            $delay = self::backoffDelay( $this->params['retryCount'] );
+            $job = clone $this;
+            $job->params['retryCount']++;
+            $job->params['service'] = $serviceName;
+            $job->setDelay( $delay );
+            LoggerFactory::getInstance( 'TTMServerUpdates' )->debug(
+                'Service {service} reported frozen. ' .
+                'Requeueing job with delay of {delay}s',
+                [
+                    'service' => $serviceName,
+                    'delay' => $delay
+                ]
+            );
+            $this->resend( $job );
+        }
+    }
+
+    /**
+     * Extracted for testing purpose
+     * @param self $job
+     */
+    protected function resend( self $job ) {
+        JobQueueGroup::singleton()->push( $job );
+    }
+
+    /**
+     * Dispatches the 'command' param to the matching update method.
+     * Any other command value results in no action.
+     *
+     * @param WritableTTMServer $ttmserver
+     */
+    private function runCommand( WritableTTMServer $ttmserver ) {
+        $handle = $this->getHandle();
+        $command = $this->params['command'];
+
+        if ( $command === 'delete' ) {
+            $this->updateItem( $ttmserver, $handle, null, false );
+        } elseif ( $command === 'rebuild' ) {
+            $this->updateMessage( $ttmserver, $handle );
+        } elseif ( $command === 'refresh' ) {
+            $this->updateTranslation( $ttmserver, $handle );
+        }
+    }
+
+    /**
+     * Extracted for testing purpose
+     *
+     * @return MessageHandle
+     */
+    protected function getHandle() {
+        return new MessageHandle( $this->title );
+    }
+
+    /**
+     * Extracted for testing purpose
+     *
+     * @param MessageHandle $handle
+     * @return string
+     */
+    protected function getTranslation( MessageHandle $handle ) {
+        return TranslateUtils::getMessageContent(
+            $handle->getKey(),
+            $handle->getCode(),
+            $handle->getTitle()->getNamespace()
+        );
+    }
+
+    /**
+     * Updates every translation of the message on the given server.
+     *
+     * @param WritableTTMServer $ttmserver
+     * @param MessageHandle $handle
+     */
+    private function updateMessage( WritableTTMServer $ttmserver, MessageHandle $handle ) {
+        // Base page update, e.g. group change. Update everything.
+        $translations = ApiQueryMessageTranslations::getTranslations( $handle );
+        foreach ( $translations as $page => $data ) {
+            $tTitle = Title::makeTitle( $this->title->getNamespace(), $page );
+            $tHandle = new MessageHandle( $tTitle );
+            $this->updateItem( $ttmserver, $tHandle, $data[0], $tHandle->isFuzzy() );
+        }
+    }
+
+    /**
+     * Updates the single translation identified by the handle.
+     *
+     * @param WritableTTMServer $ttmserver
+     * @param MessageHandle $handle
+     */
+    private function updateTranslation( WritableTTMServer $ttmserver, MessageHandle $handle ) {
+        // Update only this translation
+        $translation = $this->getTranslation( $handle );
+        $this->updateItem( $ttmserver, $handle, $translation, $handle->isFuzzy() );
+    }
+
+    /**
+     * Sends one update to the server. Fuzzy translations are sent with a
+     * null text, the same way as deletions are.
+     *
+     * @param WritableTTMServer $ttmserver
+     * @param MessageHandle $handle
+     * @param string|null $text
+     * @param bool $fuzzy
+     */
+    private function updateItem( WritableTTMServer $ttmserver, MessageHandle $handle, $text, $fuzzy ) {
+        if ( $fuzzy ) {
+            $text = null;
+        }
+        $ttmserver->update( $handle, $text );
+    }
+
+    /**
+     * Set a delay for this job. Note that this might not be possible, the JobQueue
+     * implementation handling this job doesn't support it (JobQueueDB) but is possible
+     * for the high performance JobQueueRedis. Note also that delays are minimums -
+     * at least JobQueueRedis makes no effort to remove the delay as soon as possible
+     * after it has expired. By default it only checks every five minutes or so.
+     * Note yet again that if another delay has been set that is longer than this one
+     * then the _longer_ delay stays.
+     *
+     * @param int $delay seconds to delay this job if possible
+     */
+    public function setDelay( $delay ) {
+        $jobQueue = JobQueueGroup::singleton()->get( $this->getType() );
+        if ( !$delay || !$jobQueue->delayedJobsEnabled() ) {
+            return;
+        }
+        $oldTime = $this->getReleaseTimestamp();
+        $newTime = time() + $delay;
+        if ( $oldTime !== null && $oldTime >= $newTime ) {
+            return;
+        }
+        $this->params[ 'jobReleaseTimestamp' ] = $newTime;
+    }
+
+    /**
+     * @param int $retryCount The number of times the job has errored out.
+     * @return int Number of seconds to delay. With the default exponent
+     * of 7 the possible return values are 128, 256, 512, 1024 and 2048 giving a
+     * maximum delay of about 34 minutes.
+     */
+    public static function backoffDelay( $retryCount ) {
+        // ceil() returns a float; cast so that the documented int is returned
+        // and timestamps computed from the delay stay integers.
+        return (int)ceil( pow(
+            2,
+            static::WRITE_BACKOFF_EXPONENT + rand( 0, min( $retryCount, 4 ) )
+        ) );
+    }
+}
diff --git a/www/wiki/extensions/Translate/ttmserver/schema.xml b/www/wiki/extensions/Translate/ttmserver/schema.xml
new file mode 100644
index 00000000..0ed2f047
--- /dev/null
+++ b/www/wiki/extensions/Translate/ttmserver/schema.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!-- This is the schema file for TTMServer when using Solr as the backend. -->
+<schema name="ttmserver" version="1.5">
+ <types>
+ <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="string" class="solr.StrField" sortMissingLast="true" />
+ <fieldType name="tint" class="solr.TrieIntField" precisionStep="50" positionIncrementGap="0"/>
+ <!-- Our input can basically be in any language, so we use either
+ language agnostic processing or something that can adapt to
+ the language in question. -->
+ <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <!-- Consider using solr.ICUTokenizerFactory -->
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <!-- Consider using solr.ICUNormalizer2FilterFactory -->
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldType>
+ </types>
+
+ <fields>
+ <field name="_version_" type="long" indexed="true" stored="true" />
+
+ <!-- If multiple wikis are using the same server, this will tell which one
+ owns this document. Maps to MediaWiki wfWikiId(). -->
+ <field name="wiki" type="string" indexed="true" stored="true" required="true" />
+ <!-- Title::getPrefixedText() of the message definition page. -->
+ <field name="messageid" type="string" indexed="true" stored="true" required="true" />
+ <!-- Consists of concatenation of wiki and messageid. -->
+ <field name="globalid" type="string" indexed="true" stored="true" required="true" />
+ <!-- URL or something to the translation in the wiki. -->
+ <field name="uri" type="string" indexed="true" stored="true" required="true" />
+
+ <!-- FACETs: Language and groups. -->
+ <field name="language" type="string" indexed="true" stored="true" required="true" />
+ <field name="group" multiValued="true" indexed="true" stored="true" type="string" />
+
+ <field name="content" type="string" indexed="true" stored="true" required="true" />
+
+ <field name="text" type="text_ws" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
+ <copyField source="content" dest="text"/>
+ </fields>
+ <defaultSearchField>text</defaultSearchField>
+ <uniqueKey>globalid</uniqueKey>
+</schema>