client = new Solarium_Client( $config['config'] );
		} else {
			$this->client = new Solarium_Client();
		}
	}

	/**
	 * Check whether a suggestion originates from this wiki.
	 *
	 * @param array $suggestion Suggestion entry; its 'wiki' key is compared
	 *  against wfWikiID().
	 * @return bool
	 */
	public function isLocalSuggestion( array $suggestion ) {
		return $suggestion['wiki'] === wfWikiID();
	}

	/**
	 * Return the location of a suggestion, which is its stored 'uri' value.
	 *
	 * @param array $suggestion Suggestion entry; its 'uri' key is returned as is.
	 * @return string Value of the 'uri' key (createDocument() stores the
	 *  canonical URL of the translation page there).
	 */
	public function expandLocation( array $suggestion ) {
		return $suggestion['uri'];
	}

	/**
	 * Find translation suggestions for a text.
	 *
	 * Thin wrapper around doQuery() that converts any Solarium_Exception into
	 * a TranslationHelperException.
	 *
	 * @param string $sourceLanguage
	 * @param string $targetLanguage
	 * @param string $text
	 * @return array[] See doQuery().
	 * @throws TranslationHelperException
	 */
	public function query( $sourceLanguage, $targetLanguage, $text ) {
		try {
			return $this->doQuery( $sourceLanguage, $targetLanguage, $text );
		} catch ( Solarium_Exception $e ) {
			throw new TranslationHelperException( 'Solarium exception: ' . $e );
		}
	}

	/**
	 * Run the actual suggestion lookup.
	 *
	 * @see ReadableTTMServer::query
	 * @param string $sourceLanguage Language code the source strings are matched in.
	 * @param string $targetLanguage Language code translations are fetched for.
	 * @param string $text Text to find similar source strings for.
	 * @return array[] Suggestions sorted by descending quality. Each entry has
	 *  the keys source, target, context, quality, wiki, location and uri.
	 *  Array keys are preserved by the sort and are therefore not guaranteed
	 *  to be sequential. Empty when the text is too long or nothing matches.
	 */
	protected function doQuery( $sourceLanguage, $targetLanguage, $text ) {
		/* Two query system:
		 * 1) Find all strings in source language that match text
		 * 2) Do another query for translations for those strings
		 */

		// For now impose a length limit (in bytes) on query string to avoid
		// doing very slow queries. Magic number.
		if ( strlen( $text ) > 789 ) {
			return [];
		}

		$query = $this->client->createSelect();
		$query->setFields( [ 'globalid', 'content', 'score' ] );

		/* The interface usually displays three best candidates. These might
		 * come from more than three matches, if the translation is the same.
		 * This might not find all suggestions, if the top N best matching
		 * source texts don't have translations, but worse matches do. We
		 * could loop with start parameter to fetch more until we have enough
		 * suggestions or the quality drops below the cutoff point. */
		$query->setRows( 25 );

		/* Our string can contain all kind of nasty characters, so we need
		 * escape them with great pain. */
		$helper = $query->getHelper();
		$dist = $helper->escapePhrase( $text );
		// "edit" could also be ngram of other algorithm
		$dist = "strdist($dist,content,edit)";
		/* Note how we need to escape twice here, first the string for strdist
		 * and then the strdist call itself for the query. And of course every-
		 * thing will be URL encoded once sent over the line. */
		$query->setQuery( '_val_:%P1%', [ $dist ] );

		/* Filter queries are supposed to be efficient as they are separately
		 * cached, but I haven't done any benchmarks. */
		$query->createFilterQuery( 'lang' )
			->setQuery( 'language:%P1%', [ $sourceLanguage ] );

		$resultset = $this->client->select( $query );

		/* This loop is doing two unrelated things:
		 * 1) Collect the message contents and scores so that they can
		 *    be accessed later for the translations we found.
		 * 2) Build the query string for the query that fetches the
		 *    translations.
		 * This code is a bit uglier than I'd like it to be, since
		 * there is no field that globally identifies a message (message
		 * definition and translations). */
		$contents = [];
		$scores = [];
		$queryString = '';
		foreach ( $resultset as $doc ) {
			// Strip the trailing "/<language>" part of the global id.
			$sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid );
			$contents[$sourceId] = $doc->content;
			$scores[$sourceId] = $doc->score;

			$globalid = $helper->escapePhrase( "$sourceId/$targetLanguage" );
			$queryString .= "globalid:$globalid ";
		}

		// No matching source strings means there is nothing to look up
		// translations for; do not send an empty query to the server.
		if ( $queryString === '' ) {
			return [];
		}

		// Second query to fetch available translations
		$fetchQuery = $this->client->createSelect();
		$fetchQuery->setFields( [ 'wiki', 'uri', 'content', 'messageid', 'globalid' ] );
		// These come in random order, so have to fetch all and sort
		$fetchQuery->setRows( 25 );
		$fetchQuery->setQuery( $queryString );
		// With AND we would not find anything, obviously.
		$fetchQuery->setQueryDefaultOperator( Solarium_Query_Select::QUERY_OPERATOR_OR );

		$translations = $this->client->select( $fetchQuery );

		$suggestions = [];
		foreach ( $translations as $doc ) {
			/* Construct the matching source id */
			$sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid );

			/* Unfortunately we cannot do this on the search server,
			 * because score is not a real field and thus cannot be
			 * used in a filter query. */
			$quality = $scores[$sourceId];
			if ( $quality < $this->config['cutoff'] ) {
				continue;
			}

			$suggestions[] = [
				'source' => $contents[$sourceId],
				'target' => $doc->content,
				'context' => $doc->messageid,
				'quality' => $quality,
				'wiki' => $doc->wiki,
				'location' => $doc->messageid . '/' . $targetLanguage,
				'uri' => $doc->uri,
			];
		}

		/* Like mentioned above, we get results in random order. Sort them
		 * now to have best matches first as expected by callers. */
		uasort( $suggestions, function ( $a, $b ) {
			if ( $a['quality'] === $b['quality'] ) {
				return 0;
			}

			return ( $a['quality'] < $b['quality'] ) ? 1 : -1;
		} );

		return $suggestions;
	}

	/* Write functions */

	/**
	 * Update the index for a single message.
	 *
	 * @param MessageHandle $handle Message being updated.
	 * @param string|null $targetText New text to index, or null to only
	 *  delete the existing entry.
	 * @return bool False when the handle has no language code or when the
	 *  write to the search server failed, true otherwise.
	 */
	public function update( MessageHandle $handle, $targetText ) {
		if ( $handle->getCode() === '' ) {
			return false;
		}

		/* There are various different cases here:
		 * [new or updated] [fuzzy|non-fuzzy] [translation|definition]
		 * 1) We don't distinguish between new or updated here.
		 * 2) Delete old translation, but not definition
		 * 3) Insert new translation or definition, if non-fuzzy
		 * The definition should never be fuzzied anyway.
		 *
		 * These only apply to known messages.
		 */

		$update = $this->client->createUpdate();
		$title = $handle->getTitle();

		$doDelete = true;
		$sourceLanguage = '';
		if ( $handle->isValid() ) {
			$sourceLanguage = $handle->getGroup()->getSourceLanguage();
			if ( $handle->getCode() === $sourceLanguage ) {
				$doDelete = false;
			}
		}

		if ( $doDelete ) {
			$base = Title::makeTitle( $title->getNamespace(), $handle->getKey() );
			$conds = [
				'wiki' => wfWikiID(),
				'language' => $handle->getCode(),
				'messageid' => $base->getPrefixedText(),
			];

			// Build "field:value" clauses into a separate list rather than
			// rewriting $conds through a reference that would dangle after
			// the loop.
			$helper = $update->getHelper();
			$clauses = [];
			foreach ( $conds as $field => $value ) {
				$clauses[] = "$field:" . $helper->escapePhrase( $value );
			}
			$update->addDeleteQuery( implode( ' AND ', $clauses ) );
		}

		if ( $targetText !== null ) {
			if ( $handle->isValid() ) {
				// Of the message definition page
				$sourceTitle = Title::makeTitle(
					$title->getNamespace(),
					$handle->getKey() . '/' . $sourceLanguage
				);
				/* Note: in some cases the source page might not exist, in this case
				 * we use 0 as message version identifier, to differentiate them from
				 * orphan messages */
				$revId = (int)$sourceTitle->getLatestRevID();
			} else {
				$revId = 'orphan';
			}

			$doc = $this->createDocument( $handle, $targetText, $revId );
			// Add document and commit within X seconds.
			$update->addDocument( $doc, null, self::COMMIT_WITHIN );
		}

		try {
			$this->client->update( $update );
		} catch ( Solarium_Exception $e ) {
			// Keep the failure non-fatal, but do not lose the reason for it.
			error_log( 'SolrTTMServer update-write failed: ' . $e->getMessage() );

			return false;
		}

		return true;
	}

	/**
	 * Build the index document for one message in one language.
	 *
	 * @see schema.xml
	 * @param MessageHandle $handle
	 * @param string $text Content to store in the document.
	 * @param int|string $revId Revision id of the source page; update() passes
	 *  the string 'orphan' for handles that are not valid. It becomes part
	 *  of the document's globalid.
	 * @return Solarium_Document_ReadWrite
	 */
	protected function createDocument( MessageHandle $handle, $text, $revId ) {
		$language = $handle->getCode();
		$translationTitle = $handle->getTitle();

		$title = Title::makeTitle( $handle->getTitle()->getNamespace(), $handle->getKey() );
		$wiki = wfWikiID();
		$messageid = $title->getPrefixedText();
		$globalid = "$wiki-$messageid-$revId/$language";

		$doc = new Solarium_Document_ReadWrite();
		$doc->wiki = $wiki;
		$doc->uri = $translationTitle->getCanonicalURL();
		$doc->messageid = $messageid;
		$doc->globalid = $globalid;

		$doc->language = $language;
		$doc->content = $text;
		$doc->setField( 'group', $handle->getGroupIds() );

		return $doc;
	}

	/**
	 * Start a bootstrap: delete every document of this wiki and commit.
	 */
	public function beginBootstrap() {
		$update = $this->client->createUpdate();
		$query = 'wiki:' . $update->getHelper()->escapePhrase( wfWikiID() );
		$update->addDeleteQuery( $query );
		$update->addCommit();
		$this->client->update( $update );
	}

	/**
	 * Start a batch. Intentionally does nothing.
	 */
	public function beginBatch() {
		// I hate the rule that forbids {}
	}

	/**
	 * Insert a batch of message definitions.
	 *
	 * Runs a LinkBatch over the titles of all handles first (presumably to
	 * warm the cache for the later revision id lookups - confirm), then
	 * delegates to batchInsertTranslations().
	 *
	 * @param array[] $batch List of [ MessageHandle, source language, text ].
	 */
	public function batchInsertDefinitions( array $batch ) {
		$lb = new LinkBatch();
		foreach ( $batch as $data ) {
			$lb->addObj( $data[0]->getTitle() );
		}
		$lb->execute();

		$this->batchInsertTranslations( $batch );
	}

	/**
	 * Insert a batch of documents, retrying on HTTP failures.
	 *
	 * The update is attempted up to five times with a ten second pause
	 * between attempts; the exception of the last failed attempt is rethrown.
	 *
	 * @param array[] $batch List of [ MessageHandle, source language, text ].
	 * @throws Solarium_Client_HttpException When all attempts fail.
	 */
	public function batchInsertTranslations( array $batch ) {
		$update = $this->client->createUpdate();
		foreach ( $batch as $data ) {
			list( $handle, $sourceLanguage, $text ) = $data;
			$revId = $handle->getTitleForLanguage( $sourceLanguage )->getLatestRevID();
			$doc = $this->createDocument( $handle, $text, $revId );
			// Add document and commit within X seconds.
			$update->addDocument( $doc, null, self::COMMIT_WITHIN );
		}

		$retries = 5;

		while ( $retries-- > 0 ) {
			try {
				$this->client->update( $update );
				break;
			} catch ( Solarium_Client_HttpException $e ) {
				if ( $retries === 0 ) {
					throw $e;
				} else {
					$c = get_class( $e );
					$msg = $e->getMessage();
					$this->logOutput( "Batch failed ($c: $msg), trying again in 10 seconds" );
					sleep( 10 );
				}
			}
		}
	}

	/**
	 * End a batch by sending an update request without any commands.
	 */
	public function endBatch() {
		$update = $this->client->createUpdate();
		$this->client->update( $update );
	}

	/**
	 * End a bootstrap: commit and optimize the index.
	 */
	public function endBootstrap() {
		$update = $this->client->createUpdate();
		$update->addCommit();
		$update->addOptimize();
		$this->client->update( $update );
	}

	/**
	 * @return Solarium_Client The client used by this server.
	 */
	public function getSolarium() {
		return $this->client;
	}

	/**
	 * Set the object that receives progress output.
	 *
	 * @param object $logger Object providing a statusLine( string ) method.
	 */
	public function setLogger( $logger ) {
		$this->logger = $logger;
	}

	// Can it get any uglier?
protected function logOutput( $text ) {
		// Forward a status line to the logger, if one was set with setLogger().
		if ( $this->logger ) {
			$this->logger->statusLine( "$text\n" );
		}
	}

	/**
	 * Search interface
	 *
	 * Builds an edismax select query with highlighting on the 'text' field,
	 * optional language and group filter queries, and facets for language
	 * and group that ignore those filters.
	 *
	 * @param string $queryString
	 * @param array $opts Options; the keys limit, offset, language and group
	 *  are read. An empty string for language or group disables that filter.
	 * @param array $highlight Two items: highlight prefix and postfix.
	 * @return array Result set as returned by the Solarium client's select().
	 * @throws TTMServerException
	 */
	public function search( $queryString, $opts, $highlight ) {
		$client = $this->getSolarium();

		$query = $client->createSelect();
		$dismax = $query->getDisMax();
		$dismax->setQueryParser( 'edismax' );
		$query->setQuery( $queryString );
		$query->setRows( $opts['limit'] );
		$query->setStart( $opts['offset'] );

		list( $pre, $post ) = $highlight;
		$hl = $query->getHighlighting();
		$hl->setFields( 'text' );
		$hl->setSimplePrefix( $pre );
		$hl->setSimplePostfix( $post );
		$hl->setMaxAnalyzedChars( '5000' );
		$hl->setFragSize( '5000' );
		$hl->setSnippets( 1 );

		// Both filters share the 'filter' tag so that the facets below can
		// exclude them.
		$languageFilter = $opts['language'];
		if ( $languageFilter !== '' ) {
			$query->createFilterQuery( 'languageFilter' )
				->setQuery( 'language:%P1%', [ $languageFilter ] )
				->addTag( 'filter' );
		}

		$groupFilter = $opts['group'];
		if ( $groupFilter !== '' ) {
			$query->createFilterQuery( 'groupFilter' )
				->setQuery( 'group:%P1%', [ $groupFilter ] )
				->addTag( 'filter' );
		}

		$facetSet = $query->getFacetSet();

		$language = $facetSet->createFacetField( 'language' );
		$language->setField( 'language' );
		$language->setMinCount( 1 );
		$language->addExclude( 'filter' );

		$group = $facetSet->createFacetField( 'group' );
		$group->setField( 'group' );
		$group->setMinCount( 1 );
		$group->setMissing( true );
		$group->addExclude( 'filter' );

		try {
			return $client->select( $query );
		} catch ( Solarium_Client_HttpException $e ) {
			throw new TTMServerException( $e->getMessage() );
		}
	}

	/**
	 * Extract the language and group facets from a search() result.
	 *
	 * @param mixed $resultset Result set returned by search().
	 * @return array[] Map with the keys 'language' and 'group', each holding
	 *  the corresponding facet converted to an array.
	 */
	public function getFacets( $resultset ) {
		return [
			'language' => iterator_to_array( $resultset->getFacetSet()->getFacet( 'language' ) ),
			'group' => iterator_to_array( $resultset->getFacetSet()->getFacet( 'group' ) ),
		];
	}

	/**
	 * @param mixed $resultset Result set returned by search().
	 * @return int Value of the result set's getNumFound().
	 */
	public function getTotalHits( $resultset ) {
		return $resultset->getNumFound();
	}

	/**
	 * Convert a search() result into a list of plain field arrays.
	 *
	 * Each entry holds the document's fields plus 'localid' (a copy of
	 * 'messageid') and 'content', which is the first highlighted snippet of
	 * the 'text' field or, when there is none, the document's plain text.
	 *
	 * @param mixed $resultset Result set returned by search().
	 * @return array[]
	 */
	public function getDocuments( $resultset ) {
		$highlighting = $resultset->getHighlighting();
		$ret = [];
		foreach ( $resultset as $document ) {
			$fields = iterator_to_array( $document );
			// Compatibility mapping
			$fields['localid'] = $fields['messageid'];

			// The highlighting component, or the entry for this particular
			// document, can be missing; treat that the same as "no snippets"
			// instead of calling a method on null.
			$hdoc = $highlighting ? $highlighting->getResult( $document->globalid ) : null;
			$text = $hdoc ? $hdoc->getField( 'text' ) : [];
			if ( $text === [] ) {
				$text = $document->text;
			} else {
				$text = $text[0];
			}

			$fields['content'] = $text;
			$ret[] = $fields;
		}

		return $ret;
	}
}