diff options
Diffstat (limited to 'www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php')
-rw-r--r-- | www/wiki/extensions/Translate/ttmserver/SolrTTMServer.php | 446 |
1 files changed, 446 insertions, 0 deletions
<?php
/**
 * TTMServer - The Translate extension translation memory interface
 *
 * @file
 * @author Niklas Laxström
 * @copyright Copyright © 2012-2013, Niklas Laxström
 * @license GPL-2.0-or-later
 * @ingroup TTMServer
 */

/**
 * TTMServer backend based on a Solr instance. Depends on Solarium.
 * @since 2012-06-27
 * @ingroup TTMServer
 * @deprecated 1.27. Will be removed in 1.29.
 */
class SolrTTMServer
	extends TTMServer
	implements ReadableTTMServer, SearchableTTMServer, WritableTTMServer
{
	/**
	 * In case auto-commit is not enabled, or even if it is, tell solr to
	 * commit before this time has passed, in milliseconds.
	 */
	const COMMIT_WITHIN = 5000;

	/**
	 * Solarium client used for every request this class sends.
	 */
	protected $client;

	/**
	 * Reference to the maintenance script to relay logging output.
	 */
	protected $logger;

	/**
	 * @param array $config Server configuration. When the key 'config' is
	 *  set, its value is handed to the Solarium client constructor;
	 *  otherwise the client is created with its defaults.
	 */
	public function __construct( $config ) {
		wfDeprecated( __METHOD__, '1.24' );

		parent::__construct( $config );

		if ( isset( $config['config'] ) ) {
			$this->client = new Solarium_Client( $config['config'] );
		} else {
			$this->client = new Solarium_Client();
		}
	}

	/**
	 * Tells whether a suggestion was stored by the current wiki.
	 * @param array $suggestion Suggestion as produced by query().
	 * @return bool
	 */
	public function isLocalSuggestion( array $suggestion ) {
		return $suggestion['wiki'] === wfWikiID();
	}

	/**
	 * Returns the location of a suggestion, which is the stored 'uri' field.
	 * @param array $suggestion Suggestion as produced by query().
	 * @return string
	 */
	public function expandLocation( array $suggestion ) {
		return $suggestion['uri'];
	}

	/**
	 * Fetches translation memory suggestions for a text.
	 * @param string $sourceLanguage
	 * @param string $targetLanguage
	 * @param string $text
	 * @return array[] Suggestions, best quality first.
	 * @throws TranslationHelperException When Solarium raises an exception.
	 */
	public function query( $sourceLanguage, $targetLanguage, $text ) {
		try {
			return $this->doQuery( $sourceLanguage, $targetLanguage, $text );
		} catch ( Solarium_Exception $e ) {
			throw new TranslationHelperException( 'Solarium exception: ' . $e );
		}
	}

	/**
	 * Does the actual work for query().
	 * @see ReadableTTMServer::query
	 * @param string $sourceLanguage
	 * @param string $targetLanguage
	 * @param string $text
	 * @return array[] Suggestions, best quality first. Keys are not
	 *  renumbered after sorting.
	 */
	protected function doQuery( $sourceLanguage, $targetLanguage, $text ) {
		/* Two query system:
		 * 1) Find all strings in source language that match text
		 * 2) Do another query for translations for those strings
		 */
		// For now impose a length limit on query string to avoid doing
		// very slow queries. Magic number.
		if ( strlen( $text ) > 789 ) {
			return [];
		}

		$query = $this->client->createSelect();
		$query->setFields( [ 'globalid', 'content', 'score' ] );

		/* The interface usually displays three best candidates. These might
		 * come from more than three matches, if the translation is the same.
		 * This might not find all suggestions, if the top N best matching
		 * source texts don't have translations, but worse matches do. We
		 * could loop with start parameter to fetch more until we have enough
		 * suggestions or the quality drops below the cutoff point. */
		$query->setRows( 25 );

		/* Our string can contain all kind of nasty characters, so we need
		 * escape them with great pain. */
		$helper = $query->getHelper();
		$dist = $helper->escapePhrase( $text );
		// "edit" could also be ngram of other algorithm
		$dist = "strdist($dist,content,edit)";
		/* Note how we need to escape twice here, first the string for strdist
		 * and then the strdist call itself for the query. And of course every-
		 * thing will be URL encoded once sent over the line. */
		$query->setQuery( '_val_:%P1%', [ $dist ] );

		/* Filter queries are supposed to be efficient as they are separately
		 * cached, but I haven't done any benchmarks. */
		$query->createFilterQuery( 'lang' )
			->setQuery( 'language:%P1%', [ $sourceLanguage ] );

		$resultset = $this->client->select( $query );

		/* This loop is doing two unrelated things:
		 * 1) Collect the message contents and scores so that they can
		 *    be accessed later for the translations we found.
		 * 2) Build the query string for the query that fetches the
		 *    translations.
		 * This code is a bit uglier than I'd like it to be, since
		 * there is no field that globally identifies a message (message
		 * definition and translations). */
		$contents = $scores = [];
		$clauses = [];
		foreach ( $resultset as $doc ) {
			$sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid );
			$contents[$sourceId] = $doc->content;
			$scores[$sourceId] = $doc->score;

			$globalid = $helper->escapePhrase( "$sourceId/$targetLanguage" );
			$clauses[] = "globalid:$globalid";
		}

		// Without any matching source text there is nothing to look up, and
		// the second request would be sent with an empty query string.
		if ( $clauses === [] ) {
			return [];
		}

		// Second query to fetch available translations
		$fetchQuery = $this->client->createSelect();
		$fetchQuery->setFields( [ 'wiki', 'uri', 'content', 'messageid', 'globalid' ] );
		// These come in random order, so have to fetch all and sort
		$fetchQuery->setRows( 25 );
		$fetchQuery->setQuery( implode( ' ', $clauses ) );
		// With AND we would not find anything, obviously.
		$fetchQuery->setQueryDefaultOperator( Solarium_Query_Select::QUERY_OPERATOR_OR );

		$translations = $this->client->select( $fetchQuery );

		$suggestions = [];
		foreach ( $translations as $doc ) {
			/* Construct the matching source id */
			$sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid );

			/* Unfortunately we cannot do this on the search server,
			 * because score is not a real field and thus cannot be
			 * used in a filter query. */
			$quality = $scores[$sourceId];
			if ( $quality < $this->config['cutoff'] ) {
				continue;
			}

			$suggestions[] = [
				'source' => $contents[$sourceId],
				'target' => $doc->content,
				'context' => $doc->messageid,
				'quality' => $quality,
				'wiki' => $doc->wiki,
				'location' => $doc->messageid . '/' . $targetLanguage,
				'uri' => $doc->uri,
			];
		}

		/* Like mentioned above, we get results in random order. Sort them
		 * now to have best matches first as expected by callers. */
		uasort( $suggestions, function ( $a, $b ) {
			if ( $a['quality'] === $b['quality'] ) {
				return 0;
			}

			return ( $a['quality'] < $b['quality'] ) ? 1 : -1;
		} );

		return $suggestions;
	}

	/* Write functions */

	/**
	 * Updates the stored document for one message after it has changed.
	 * @param MessageHandle $handle
	 * @param string|null $targetText New text, or null when no new document
	 *  should be added.
	 * @return bool False when the handle has an empty language code or the
	 *  Solr request failed, true otherwise.
	 */
	public function update( MessageHandle $handle, $targetText ) {
		if ( $handle->getCode() === '' ) {
			return false;
		}

		/* There are various different cases here:
		 * [new or updated] [fuzzy|non-fuzzy] [translation|definition]
		 * 1) We don't distinguish between new or updated here.
		 * 2) Delete old translation, but not definition
		 * 3) Insert new translation or definition, if non-fuzzy
		 *    The definition should never be fuzzied anyway.
		 *
		 * These only apply to known messages.
		 */

		$update = $this->client->createUpdate();
		$title = $handle->getTitle();

		$doDelete = true;
		$sourceLanguage = '';
		if ( $handle->isValid() ) {
			$sourceLanguage = $handle->getGroup()->getSourceLanguage();
			if ( $handle->getCode() === $sourceLanguage ) {
				$doDelete = false;
			}
		}

		if ( $doDelete ) {
			$base = Title::makeTitle( $title->getNamespace(), $handle->getKey() );
			$conds = [
				'wiki' => wfWikiID(),
				'language' => $handle->getCode(),
				'messageid' => $base->getPrefixedText(),
			];
			// Build the clauses into a separate list rather than rewriting
			// $conds by reference, so no reference variable is left behind.
			$helper = $update->getHelper();
			$clauses = [];
			foreach ( $conds as $field => $value ) {
				$clauses[] = "$field:" . $helper->escapePhrase( $value );
			}
			$update->addDeleteQuery( implode( ' AND ', $clauses ) );
		}

		if ( $targetText !== null ) {
			if ( $handle->isValid() ) {
				// Of the message definition page
				$targetTitle = $handle->getTitle();
				$sourceTitle = Title::makeTitle(
					$targetTitle->getNamespace(),
					$handle->getKey() . '/' . $sourceLanguage
				);
				$revId = (int)$sourceTitle->getLatestRevID();
				/* Note: in some cases the source page might not exist, in this case
				 * we use 0 as message version identifier, to differentiate them from
				 * orphan messages */
			} else {
				$revId = 'orphan';
			}

			$doc = $this->createDocument( $handle, $targetText, $revId );
			// Add document and commit within X seconds.
			$update->addDocument( $doc, null, self::COMMIT_WITHIN );
		}

		try {
			$this->client->update( $update );
		} catch ( Solarium_Exception $e ) {
			// Keep the failure non-fatal for the caller, but record why it failed.
			error_log( 'SolrTTMServer update-write failed: ' . $e->getMessage() );

			return false;
		}

		return true;
	}

	/**
	 * Builds the Solr document for one message in one language.
	 * @see schema.xml
	 * @param MessageHandle $handle
	 * @param string $text
	 * @param int|string $revId Latest revision id of the source page;
	 *  update() passes the string 'orphan' for invalid handles.
	 * @return Solarium_Document_ReadWrite
	 */
	protected function createDocument( MessageHandle $handle, $text, $revId ) {
		$language = $handle->getCode();
		$translationTitle = $handle->getTitle();

		$title = Title::makeTitle( $handle->getTitle()->getNamespace(), $handle->getKey() );
		$wiki = wfWikiID();
		$messageid = $title->getPrefixedText();
		$globalid = "$wiki-$messageid-$revId/$language";

		$doc = new Solarium_Document_ReadWrite();
		$doc->wiki = $wiki;
		$doc->uri = $translationTitle->getCanonicalURL();
		$doc->messageid = $messageid;
		$doc->globalid = $globalid;

		$doc->language = $language;
		$doc->content = $text;
		$doc->setField( 'group', $handle->getGroupIds() );

		return $doc;
	}

	/**
	 * Starts a bootstrap by deleting every document of the current wiki
	 * and committing the deletion.
	 */
	public function beginBootstrap() {
		$update = $this->client->createUpdate();
		$query = 'wiki:' . $update->getHelper()->escapePhrase( wfWikiID() );
		$update->addDeleteQuery( $query );
		$update->addCommit();
		$this->client->update( $update );
	}

	/**
	 * Intentionally does nothing.
	 */
	public function beginBatch() {
	}

	/**
	 * Inserts a batch of message definitions. The titles are run through a
	 * LinkBatch first; the rest is shared with batchInsertTranslations().
	 * @param array[] $batch Items of the form [ handle, sourceLanguage, text ].
	 */
	public function batchInsertDefinitions( array $batch ) {
		$lb = new LinkBatch();
		foreach ( $batch as $data ) {
			$lb->addObj( $data[0]->getTitle() );
		}
		$lb->execute();

		$this->batchInsertTranslations( $batch );
	}

	/**
	 * Inserts a batch of documents in a single update request. The request
	 * is attempted at most five times, sleeping 10 seconds after a failure.
	 * @param array[] $batch Items of the form [ handle, sourceLanguage, text ].
	 * @throws Solarium_Client_HttpException When the last attempt fails too.
	 */
	public function batchInsertTranslations( array $batch ) {
		$update = $this->client->createUpdate();
		foreach ( $batch as $data ) {
			list( $handle, $sourceLanguage, $text ) = $data;
			$revId = $handle->getTitleForLanguage( $sourceLanguage )->getLatestRevID();
			$doc = $this->createDocument( $handle, $text, $revId );
			// Add document and commit within X seconds.
			$update->addDocument( $doc, null, self::COMMIT_WITHIN );
		}

		$retries = 5;

		while ( $retries-- > 0 ) {
			try {
				$this->client->update( $update );
				break;
			} catch ( Solarium_Client_HttpException $e ) {
				if ( $retries === 0 ) {
					throw $e;
				} else {
					$c = get_class( $e );
					$msg = $e->getMessage();
					$this->logOutput( "Batch failed ($c: $msg), trying again in 10 seconds" );
					sleep( 10 );
				}
			}
		}
	}

	/**
	 * Ends a batch by sending an update request with no commands added.
	 */
	public function endBatch() {
		$update = $this->client->createUpdate();
		$this->client->update( $update );
	}

	/**
	 * Ends a bootstrap by committing and optimizing the index.
	 */
	public function endBootstrap() {
		$update = $this->client->createUpdate();
		$update->addCommit();
		$update->addOptimize();
		$this->client->update( $update );
	}

	/**
	 * @return Solarium_Client The client created in the constructor.
	 */
	public function getSolarium() {
		return $this->client;
	}

	/**
	 * @param object $logger Object with a statusLine() method, used by
	 *  logOutput() to relay progress messages.
	 */
	public function setLogger( $logger ) {
		$this->logger = $logger;
	}

	/**
	 * Relays a message to the logger, if one has been set.
	 * @param string $text Message without the trailing newline.
	 */
	protected function logOutput( $text ) {
		if ( $this->logger ) {
			$this->logger->statusLine( "$text\n" );
		}
	}

	/**
	 * Search interface
	 * @param string $queryString
	 * @param array $opts Must contain the keys 'limit', 'offset', 'language'
	 *  and 'group'; an empty string disables the respective filter.
	 * @param array $highlight Two items: highlight prefix and postfix.
	 * @return array
	 * @throws TTMServerException
	 */
	public function search( $queryString, $opts, $highlight ) {
		$client = $this->getSolarium();

		$query = $client->createSelect();
		$dismax = $query->getDisMax();
		$dismax->setQueryParser( 'edismax' );
		$query->setQuery( $queryString );
		$query->setRows( $opts['limit'] );
		$query->setStart( $opts['offset'] );

		list( $pre, $post ) = $highlight;
		$hl = $query->getHighlighting();
		$hl->setFields( 'text' );
		$hl->setSimplePrefix( $pre );
		$hl->setSimplePostfix( $post );
		$hl->setMaxAnalyzedChars( '5000' );
		$hl->setFragSize( '5000' );
		$hl->setSnippets( 1 );

		$languageFilter = $opts['language'];
		if ( $languageFilter !== '' ) {
			$query->createFilterQuery( 'languageFilter' )
				->setQuery( 'language:%P1%', [ $languageFilter ] )
				->addTag( 'filter' );
		}

		$groupFilter = $opts['group'];
		if ( $groupFilter !== '' ) {
			$query->createFilterQuery( 'groupFilter' )
				->setQuery( 'group:%P1%', [ $groupFilter ] )
				->addTag( 'filter' );
		}

		// Both facets exclude the filter queries tagged 'filter' above.
		$facetSet = $query->getFacetSet();

		$language = $facetSet->createFacetField( 'language' );
		$language->setField( 'language' );
		$language->setMinCount( 1 );
		$language->addExclude( 'filter' );

		$group = $facetSet->createFacetField( 'group' );
		$group->setField( 'group' );
		$group->setMinCount( 1 );
		$group->setMissing( true );
		$group->addExclude( 'filter' );

		try {
			return $client->select( $query );
		} catch ( Solarium_Client_HttpException $e ) {
			throw new TTMServerException( $e->getMessage() );
		}
	}

	/**
	 * Extracts the 'language' and 'group' facets from a search result.
	 * @param mixed $resultset Value returned by search().
	 * @return array[] Map with the keys 'language' and 'group'.
	 */
	public function getFacets( $resultset ) {
		return [
			'language' => iterator_to_array( $resultset->getFacetSet()->getFacet( 'language' ) ),
			'group' => iterator_to_array( $resultset->getFacetSet()->getFacet( 'group' ) ),
		];
	}

	/**
	 * @param mixed $resultset Value returned by search().
	 * @return int Value of getNumFound() on the result set.
	 */
	public function getTotalHits( $resultset ) {
		return $resultset->getNumFound();
	}

	/**
	 * Converts a search result into a list of field arrays. The 'content'
	 * of each item is the highlighted snippet when there is one, and the
	 * 'text' field of the document otherwise.
	 * @param mixed $resultset Value returned by search().
	 * @return array[]
	 */
	public function getDocuments( $resultset ) {
		$highlighting = $resultset->getHighlighting();
		$ret = [];
		foreach ( $resultset as $document ) {
			$fields = iterator_to_array( $document );
			// Compatibility mapping
			$fields['localid'] = $fields['messageid'];

			// A document may have no highlighting entry at all; treat that
			// the same as an empty snippet list instead of calling a method
			// on null.
			$hdoc = $highlighting ? $highlighting->getResult( $document->globalid ) : null;
			$text = $hdoc ? $hdoc->getField( 'text' ) : [];
			if ( $text === [] ) {
				$text = $document->text;
			} else {
				$text = $text[0];
			}

			$fields['content'] = $text;
			$ret[] = $fields;
		}

		return $ret;
	}
}