summaryrefslogtreecommitdiff
path: root/www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php
diff options
context:
space:
mode:
Diffstat (limited to 'www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php')
-rw-r--r--www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php446
1 files changed, 446 insertions, 0 deletions
diff --git a/www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php b/www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php
new file mode 100644
index 00000000..375d544f
--- /dev/null
+++ b/www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php
@@ -0,0 +1,446 @@
+<?php
+/**
+ * TTMServer - The Translate extension translation memory interface
+ *
+ * @file
+ * @author Niklas Laxström
+ * @copyright Copyright © 2012-2013, Niklas Laxström
+ * @license GPL-2.0-or-later
+ * @ingroup TTMServer
+ */
+
+/**
+ * TTMServer backed based on Solr instance. Depends on Solarium.
+ * @since 2012-06-27
+ * @ingroup TTMServer
+ * @deprecated 1.27. Will be removed in 1.29.
+ */
+class SolrTTMServer
+ extends TTMServer
+ implements ReadableTTMServer, SearchableTTMServer, WritableTTMServer
+{
+ /**
+ * In case auto-commit is not enabled, or even if it is, tell solr to
+ * commit before this time has passed, in milliseconds.
+ */
+ const COMMIT_WITHIN = 5000;
+
+ protected $client;
+
+ /**
+ * Reference to the maintenance script to relay logging output.
+ */
+ protected $logger;
+
+ public function __construct( $config ) {
+ wfDeprecated( __METHOD__, '1.24' );
+
+ parent::__construct( $config );
+
+ if ( isset( $config['config'] ) ) {
+ $this->client = new Solarium_Client( $config['config'] );
+ } else {
+ $this->client = new Solarium_Client();
+ }
+ }
+
+ public function isLocalSuggestion( array $suggestion ) {
+ return $suggestion['wiki'] === wfWikiID();
+ }
+
+ public function expandLocation( array $suggestion ) {
+ return $suggestion['uri'];
+ }
+
+ public function query( $sourceLanguage, $targetLanguage, $text ) {
+ try {
+ return $this->doQuery( $sourceLanguage, $targetLanguage, $text );
+ } catch ( Solarium_Exception $e ) {
+ throw new TranslationHelperException( 'Solarium exception: ' . $e );
+ }
+ }
+
+ /// @see ReadableTTMServer::query
+ protected function doQuery( $sourceLanguage, $targetLanguage, $text ) {
+ /* Two query system:
+ * 1) Find all strings in source language that match text
+ * 2) Do another query for translations for those strings
+ */
+ // For now impose a length limit on query string to avoid doing
+ // very slow queries. Magic number.
+ if ( strlen( $text ) > 789 ) {
+ return [];
+ }
+
+ $query = $this->client->createSelect();
+ $query->setFields( [ 'globalid', 'content', 'score' ] );
+
+ /* The interface usually displays three best candidates. These might
+ * come from more than three matches, if the translation is the same.
+ * This might not find all suggestions, if the top N best matching
+ * source texts don't have translations, but worse matches do. We
+ * could loop with start parameter to fetch more until we have enough
+ * suggestions or the quality drops below the cutoff point. */
+ $query->setRows( 25 );
+
+ /* Our string can contain all kind of nasty characters, so we need
+ * escape them with great pain. */
+ $helper = $query->getHelper();
+ $dist = $helper->escapePhrase( $text );
+ // "edit" could also be ngram of other algorithm
+ $dist = "strdist($dist,content,edit)";
+ /* Note how we need to escape twice here, first the string for strdist
+ * and then the strdist call itself for the query. And of course every-
+ * thing will be URL encoded once sent over the line. */
+ $query->setQuery( '_val_:%P1%', [ $dist ] );
+
+ /* Filter queries are supposed to be efficient as they are separately
+ * cached, but I haven't done any benchmarks. */
+ $query->createFilterQuery( 'lang' )
+ ->setQuery( 'language:%P1%', [ $sourceLanguage ] );
+
+ $resultset = $this->client->select( $query );
+
+ /* This query is doing two unrelated things:
+ * 1) Collect the message contents and scores so that they can
+ * be accessed later for the translations we found.
+ * 2) Build the query string for the query that fetches the
+ * translations.
+ * This code is a bit uglier than I'd like it to be, since there
+ * there is no field that globally identifies a message (message
+ * definition and translations). */
+ $contents = $scores = [];
+ $queryString = '';
+ foreach ( $resultset as $doc ) {
+ $sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid );
+ $contents[$sourceId] = $doc->content;
+ $scores[$sourceId] = $doc->score;
+
+ $globalid = $helper->escapePhrase( "$sourceId/$targetLanguage" );
+ $queryString .= "globalid:$globalid ";
+ }
+
+ // Second query to fetch available translations
+ $fetchQuery = $this->client->createSelect();
+ $fetchQuery->setFields( [ 'wiki', 'uri', 'content', 'messageid', 'globalid' ] );
+ // This come in random order, so have to fetch all and sort
+ $fetchQuery->setRows( 25 );
+ $fetchQuery->setQuery( $queryString );
+ // With AND we would not find anything, obviously.
+ $fetchQuery->setQueryDefaultOperator( Solarium_Query_Select::QUERY_OPERATOR_OR );
+
+ $translations = $this->client->select( $fetchQuery );
+
+ $suggestions = [];
+ foreach ( $translations as $doc ) {
+ /* Construct the matching source id */
+ $sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid );
+
+ /* Unfortunately we cannot do this on the search server,
+ * because score is not a real field and thus cannot be
+ * used in a filter query. */
+ $quality = $scores[$sourceId];
+ if ( $quality < $this->config['cutoff'] ) {
+ continue;
+ }
+
+ $suggestions[] = [
+ 'source' => $contents[$sourceId],
+ 'target' => $doc->content,
+ 'context' => $doc->messageid,
+ 'quality' => $quality,
+ 'wiki' => $doc->wiki,
+ 'location' => $doc->messageid . '/' . $targetLanguage,
+ 'uri' => $doc->uri,
+ ];
+ }
+
+ /* Like mentioned above, we get results in random order. Sort them
+ * now to have best matches first as expected by callers. */
+ uasort( $suggestions, function ( $a, $b ) {
+ if ( $a['quality'] === $b['quality'] ) {
+ return 0;
+ }
+
+ return ( $a['quality'] < $b['quality'] ) ? 1 : -1;
+ } );
+
+ return $suggestions;
+ }
+
+ /* Write functions */
+
+ public function update( MessageHandle $handle, $targetText ) {
+ if ( $handle->getCode() === '' ) {
+ return false;
+ }
+
+ /* There are various different cases here:
+ * [new or updated] [fuzzy|non-fuzzy] [translation|definition]
+ * 1) We don't distinguish between new or updated here.
+ * 2) Delete old translation, but not definition
+ * 3) Insert new translation or definition, if non-fuzzy
+ * The definition should never be fuzzied anyway.
+ *
+ * These only apply to known messages.
+ */
+
+ $update = $this->client->createUpdate();
+ $title = $handle->getTitle();
+
+ $doDelete = true;
+ $sourceLanguage = '';
+ if ( $handle->isValid() ) {
+ $sourceLanguage = $handle->getGroup()->getSourceLanguage();
+ if ( $handle->getCode() === $sourceLanguage ) {
+ $doDelete = false;
+ }
+ }
+
+ if ( $doDelete ) {
+ $base = Title::makeTitle( $title->getNamespace(), $handle->getKey() );
+ $conds = [
+ 'wiki' => wfWikiID(),
+ 'language' => $handle->getCode(),
+ 'messageid' => $base->getPrefixedText(),
+ ];
+ foreach ( $conds as $key => &$value ) {
+ $value = "$key:" . $update->getHelper()->escapePhrase( $value );
+ }
+ $update->addDeleteQuery( implode( ' AND ', $conds ) );
+ }
+
+ if ( $targetText !== null ) {
+ if ( $handle->isValid() ) {
+ // Of the message definition page
+ $targetTitle = $handle->getTitle();
+ $sourceTitle = Title::makeTitle(
+ $targetTitle->getNamespace(),
+ $handle->getKey() . '/' . $sourceLanguage
+ );
+ $revId = (int)$sourceTitle->getLatestRevID();
+ /* Note: in some cases the source page might not exist, in this case
+ * we use 0 as message version identifier, to differentiate them from
+ * orphan messages */
+ } else {
+ $revId = 'orphan';
+ }
+
+ $doc = $this->createDocument( $handle, $targetText, $revId );
+ // Add document and commit within X seconds.
+ $update->addDocument( $doc, null, self::COMMIT_WITHIN );
+ }
+
+ try {
+ $this->client->update( $update );
+ } catch ( Solarium_Exception $e ) {
+ error_log( 'SolrTTMServer update-write failed' );
+
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * @see schema.xml
+ * @param MessageHandle $handle
+ * @param string $text
+ * @param int $revId
+ * @return Solarium_Document_ReadWrite
+ */
+ protected function createDocument( MessageHandle $handle, $text, $revId ) {
+ $language = $handle->getCode();
+ $translationTitle = $handle->getTitle();
+
+ $title = Title::makeTitle( $handle->getTitle()->getNamespace(), $handle->getKey() );
+ $wiki = wfWikiID();
+ $messageid = $title->getPrefixedText();
+ $globalid = "$wiki-$messageid-$revId/$language";
+
+ $doc = new Solarium_Document_ReadWrite();
+ $doc->wiki = $wiki;
+ $doc->uri = $translationTitle->getCanonicalURL();
+ $doc->messageid = $messageid;
+ $doc->globalid = $globalid;
+
+ $doc->language = $language;
+ $doc->content = $text;
+ $doc->setField( 'group', $handle->getGroupIds() );
+
+ return $doc;
+ }
+
+ public function beginBootstrap() {
+ $update = $this->client->createUpdate();
+ $query = 'wiki:' . $update->getHelper()->escapePhrase( wfWikiID() );
+ $update->addDeleteQuery( $query );
+ $update->addCommit();
+ $this->client->update( $update );
+ }
+
+ public function beginBatch() {
+ // I hate the rule that forbids {}
+ }
+
+ public function batchInsertDefinitions( array $batch ) {
+ $lb = new LinkBatch();
+ foreach ( $batch as $data ) {
+ $lb->addObj( $data[0]->getTitle() );
+ }
+ $lb->execute();
+
+ $this->batchInsertTranslations( $batch );
+ }
+
+ public function batchInsertTranslations( array $batch ) {
+ $update = $this->client->createUpdate();
+ foreach ( $batch as $key => $data ) {
+ list( $handle, $sourceLanguage, $text ) = $data;
+ $revId = $handle->getTitleForLanguage( $sourceLanguage )->getLatestRevID();
+ $doc = $this->createDocument( $handle, $text, $revId );
+ // Add document and commit within X seconds.
+ $update->addDocument( $doc, null, self::COMMIT_WITHIN );
+ }
+
+ $retries = 5;
+
+ while ( $retries-- > 0 ) {
+ try {
+ $this->client->update( $update );
+ break;
+ } catch ( Solarium_Client_HttpException $e ) {
+ if ( $retries === 0 ) {
+ throw $e;
+ } else {
+ $c = get_class( $e );
+ $msg = $e->getMessage();
+ $this->logOutput( "Batch failed ($c: $msg), trying again in 10 seconds" );
+ sleep( 10 );
+ }
+ }
+ }
+ }
+
+ public function endBatch() {
+ $update = $this->client->createUpdate();
+ $this->client->update( $update );
+ }
+
+ public function endBootstrap() {
+ $update = $this->client->createUpdate();
+ $update->addCommit();
+ $update->addOptimize();
+ $this->client->update( $update );
+ }
+
+ public function getSolarium() {
+ return $this->client;
+ }
+
+ public function setLogger( $logger ) {
+ $this->logger = $logger;
+ }
+
+ // Can it get any uglier?
+ protected function logOutput( $text ) {
+ if ( $this->logger ) {
+ $this->logger->statusLine( "$text\n" );
+ }
+ }
+
+ /**
+ * Search interface
+ * @param string $queryString
+ * @param array $opts
+ * @param array $highlight
+ * @return array
+ * @throws TTMServerException
+ */
+ public function search( $queryString, $opts, $highlight ) {
+ $client = $this->getSolarium();
+
+ $query = $client->createSelect();
+ $dismax = $query->getDisMax();
+ $dismax->setQueryParser( 'edismax' );
+ $query->setQuery( $queryString );
+ $query->setRows( $opts['limit'] );
+ $query->setStart( $opts['offset'] );
+
+ list( $pre, $post ) = $highlight;
+ $hl = $query->getHighlighting();
+ $hl->setFields( 'text' );
+ $hl->setSimplePrefix( $pre );
+ $hl->setSimplePostfix( $post );
+ $hl->setMaxAnalyzedChars( '5000' );
+ $hl->setFragSize( '5000' );
+ $hl->setSnippets( 1 );
+
+ $languageFilter = $opts['language'];
+ if ( $languageFilter !== '' ) {
+ $query->createFilterQuery( 'languageFilter' )
+ ->setQuery( 'language:%P1%', [ $languageFilter ] )
+ ->addTag( 'filter' );
+ }
+
+ $groupFilter = $opts['group'];
+ if ( $groupFilter !== '' ) {
+ $query->createFilterQuery( 'groupFilter' )
+ ->setQuery( 'group:%P1%', [ $groupFilter ] )
+ ->addTag( 'filter' );
+ }
+
+ $facetSet = $query->getFacetSet();
+
+ $language = $facetSet->createFacetField( 'language' );
+ $language->setField( 'language' );
+ $language->setMinCount( 1 );
+ $language->addExclude( 'filter' );
+
+ $group = $facetSet->createFacetField( 'group' );
+ $group->setField( 'group' );
+ $group->setMinCount( 1 );
+ $group->setMissing( true );
+ $group->addExclude( 'filter' );
+
+ try {
+ return $client->select( $query );
+ } catch ( Solarium_Client_HttpException $e ) {
+ throw new TTMServerException( $e->getMessage() );
+ }
+ }
+
+ public function getFacets( $resultset ) {
+ return [
+ 'language' => iterator_to_array( $resultset->getFacetSet()->getFacet( 'language' ) ),
+ 'group' => iterator_to_array( $resultset->getFacetSet()->getFacet( 'group' ) ),
+ ];
+ }
+
+ public function getTotalHits( $resultset ) {
+ return $resultset->getNumFound();
+ }
+
+ public function getDocuments( $resultset ) {
+ $highlighting = $resultset->getHighlighting();
+ $ret = [];
+ foreach ( $resultset as $document ) {
+ $fields = iterator_to_array( $document );
+ // Compatibility mapping
+ $fields['localid'] = $fields['messageid'];
+
+ $hdoc = $highlighting->getResult( $document->globalid );
+ $text = $hdoc->getField( 'text' );
+ if ( $text === [] ) {
+ $text = $document->text;
+ } else {
+ $text = $text[0];
+ }
+
+ $fields['content'] = $text;
+ $ret[] = $fields;
+ }
+
+ return $ret;
+ }
+}