diff options
Diffstat (limited to 'www/wiki/extensions/Translate/ttmserver/DatabaseTTMServer.php')
-rw-r--r-- | www/wiki/extensions/Translate/ttmserver/DatabaseTTMServer.php | 282 |
1 files changed, 282 insertions, 0 deletions
<?php
/**
 * TTMServer - The Translate extension translation memory interface
 *
 * @file
 * @author Niklas Laxström
 * @copyright Copyright © 2012-2013, Niklas Laxström
 * @license GPL-2.0-or-later
 * @ingroup TTMServer
 */

use Wikimedia\Rdbms\DBQueryError;

/**
 * Mysql based backend.
 *
 * Stores source texts in translate_tms, their translations in translate_tmt
 * and a reduced list of tokens for fulltext matching in translate_tmf.
 *
 * @ingroup TTMServer
 * @since 2012-06-27
 */
class DatabaseTTMServer extends TTMServer implements WritableTTMServer, ReadableTTMServer {
	/**
	 * Source ids (tms_sid) created by batchInsertDefinitions(), indexed by
	 * the batch key, so that batchInsertTranslations() can refer to them.
	 * Reset by beginBatch().
	 * @var array
	 */
	protected $sids;

	/**
	 * Returns a connection to the database named in the 'database' config
	 * entry, using the 'ttmserver' query group.
	 *
	 * @param int $mode DB_REPLICA|DB_MASTER
	 * @return \Wikimedia\Rdbms\IDatabase
	 */
	protected function getDB( $mode = DB_REPLICA ) {
		return wfGetDB( $mode, 'ttmserver', $this->config['database'] );
	}

	/**
	 * Stores, replaces or removes the translation of a single message.
	 *
	 * @param MessageHandle $handle Message being translated.
	 * @param string|null $targetText New translation; when null the existing
	 *  translation for the language is only deleted.
	 * @return bool False when the message was skipped (invalid handle, no
	 *  language code, source language, or empty definition), true otherwise.
	 */
	public function update( MessageHandle $handle, $targetText ) {
		if ( !$handle->isValid() || $handle->getCode() === '' ) {
			return false;
		}

		$mkey = $handle->getKey();
		$group = $handle->getGroup();
		$targetLanguage = $handle->getCode();
		$sourceLanguage = $group->getSourceLanguage();

		// Skip definitions to not slow down mass imports etc.
		// These will be added when the first translation is made
		if ( $targetLanguage === $sourceLanguage ) {
			return false;
		}

		$definition = $group->getMessage( $mkey, $sourceLanguage );
		if ( !is_string( $definition ) || !strlen( trim( $definition ) ) ) {
			return false;
		}

		$context = Title::makeTitle( $handle->getTitle()->getNamespace(), $mkey );
		$dbw = $this->getDB( DB_MASTER );
		/* Check that the definition exists and fetch the sid. If not, add
		 * the definition and retrieve the sid. If the definition changes,
		 * we will create a new entry - otherwise we could at some point
		 * get suggestions which do not match the original definition any
		 * longer. The old translations are still kept until purged by
		 * rerunning the bootstrap script. */
		$conds = [
			'tms_context' => $context->getPrefixedText(),
			'tms_text' => $definition,
		];

		$sid = $dbw->selectField( 'translate_tms', 'tms_sid', $conds, __METHOD__ );
		if ( $sid === false ) {
			$sid = $this->insertSource( $context, $sourceLanguage, $definition );
		}

		// Delete old translations for this message if any. Could also use replace
		$deleteConds = [
			'tmt_sid' => $sid,
			'tmt_lang' => $targetLanguage,
		];
		$dbw->delete( 'translate_tmt', $deleteConds, __METHOD__ );

		// Insert the new translation
		if ( $targetText !== null ) {
			$row = $deleteConds + [
				'tmt_text' => $targetText,
			];

			$dbw->insert( 'translate_tmt', $row, __METHOD__ );
		}

		return true;
	}

	/**
	 * Inserts a source text into translate_tms and, when the text yields
	 * any fulltext tokens, a matching row into translate_tmf.
	 *
	 * @param Title $context Title whose prefixed text is stored as context.
	 * @param string $sourceLanguage Language code of the source text.
	 * @param string $text Source text.
	 * @return int|string Id of the inserted translate_tms row, as reported by
	 *  the database handle.
	 */
	protected function insertSource( Title $context, $sourceLanguage, $text ) {
		$row = [
			'tms_lang' => $sourceLanguage,
			'tms_len' => mb_strlen( $text ),
			'tms_text' => $text,
			'tms_context' => $context->getPrefixedText(),
		];

		$dbw = $this->getDB( DB_MASTER );
		$dbw->insert( 'translate_tms', $row, __METHOD__ );
		$sid = $dbw->insertId();

		$fulltext = $this->filterForFulltext( $sourceLanguage, $text );
		if ( $fulltext ) {
			$row = [
				'tmf_sid' => $sid,
				'tmf_text' => implode( ' ', $fulltext ),
			];
			$dbw->insert( 'translate_tmf', $row, __METHOD__ );
		}

		return $sid;
	}

	/**
	 * Tokenizes the text for fulltext search.
	 * Tries to find the most useful tokens.
	 *
	 * Texts with fewer than four words produce no tokens at all. Otherwise
	 * words shorter than 4 or longer than 15 bytes are dropped, duplicates
	 * are removed and at most the first ten remaining words are returned.
	 *
	 * @param string $language Language code
	 * @param string $input
	 * @return array
	 */
	protected function filterForFulltext( $language, $input ) {
		$lang = Language::factory( $language );

		$text = preg_replace( '/[^[:alnum:]]/u', ' ', $input );
		if ( $text === null ) {
			// preg_replace() fails on input that is not valid UTF-8; such
			// text cannot be tokenized, so it simply has no tokens.
			return [];
		}

		$text = $lang->segmentByWord( $text );
		$text = $lang->lc( $text );
		$segments = preg_split( '/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY );
		if ( count( $segments ) < 4 ) {
			return [];
		}

		foreach ( $segments as $i => $segment ) {
			// Yes strlen
			$len = strlen( $segment );
			if ( $len < 4 || $len > 15 ) {
				unset( $segments[$i] );
			}
		}

		$segments = array_unique( $segments );
		$segments = array_slice( $segments, 0, 10 );

		return $segments;
	}

	/**
	 * Empties all three translation memory tables and drops the fulltext
	 * index, which endBootstrap() creates again.
	 */
	public function beginBootstrap() {
		$dbw = $this->getDB( DB_MASTER );
		$dbw->delete( 'translate_tms', '*', __METHOD__ );
		$dbw->delete( 'translate_tmt', '*', __METHOD__ );
		$dbw->delete( 'translate_tmf', '*', __METHOD__ );
		$table = $dbw->tableName( 'translate_tmf' );
		try {
			$dbw->query( "DROP INDEX tmf_text ON $table", __METHOD__ );
		} catch ( DBQueryError $e ) {
			// Perhaps the script was aborted before it got
			// chance to add the index back.
		}
	}

	/**
	 * Starts a new batch by forgetting the source ids of the previous one.
	 */
	public function beginBatch() {
		$this->sids = [];
	}

	/**
	 * Inserts the definitions of a batch and remembers the id of each one
	 * under its batch key.
	 *
	 * @param array $batch Items of the form [ title, language code, text ].
	 */
	public function batchInsertDefinitions( array $batch ) {
		foreach ( $batch as $key => $item ) {
			list( $title, $language, $text ) = $item;
			$handle = new MessageHandle( $title );
			$context = Title::makeTitle( $handle->getTitle()->getNamespace(), $handle->getKey() );
			$this->sids[$key] = $this->insertSource( $context, $language, $text );
		}
		wfWaitForSlaves( 10 );
	}

	/**
	 * Inserts the translations of a batch with a single insert call.
	 *
	 * @param array $batch Items of the form [ (unused), language code, text ].
	 *  Each key is looked up in the ids stored by batchInsertDefinitions().
	 */
	public function batchInsertTranslations( array $batch ) {
		$rows = [];
		foreach ( $batch as $key => $data ) {
			list( , $language, $text ) = $data;
			$rows[] = [
				'tmt_sid' => $this->sids[$key],
				'tmt_lang' => $language,
				'tmt_text' => $text,
			];
		}

		$dbw = $this->getDB( DB_MASTER );
		$dbw->insert( 'translate_tmt', $rows, __METHOD__ );
		wfWaitForSlaves( 10 );
	}

	/**
	 * Nothing needs to be done at the end of a batch for this backend.
	 */
	public function endBatch() {
	}

	/**
	 * Creates the fulltext index that beginBootstrap() dropped.
	 */
	public function endBootstrap() {
		$dbw = $this->getDB( DB_MASTER );
		$table = $dbw->tableName( 'translate_tmf' );
		$dbw->query( "CREATE FULLTEXT INDEX tmf_text ON $table (tmf_text)", __METHOD__ );
	}

	/* Reading interface */

	/**
	 * @param array $suggestion
	 * @return bool Always true for this backend.
	 */
	public function isLocalSuggestion( array $suggestion ) {
		return true;
	}

	/**
	 * Turns the 'location' of a suggestion into a canonical URL.
	 *
	 * @param array $suggestion Suggestion with a 'location' page name.
	 * @return string Canonical URL, or an empty string when the location is
	 *  not a valid page title.
	 */
	public function expandLocation( array $suggestion ) {
		$title = Title::newFromText( $suggestion['location'] );
		if ( !$title ) {
			// Title::newFromText() gives null for text that is not a valid
			// title; calling a method on it would be a fatal error.
			return '';
		}

		return $title->getCanonicalURL();
	}

	/**
	 * Looks up translation suggestions for a text.
	 *
	 * @param string $sourceLanguage Language code of $text.
	 * @param string $targetLanguage Language code of the wanted translations.
	 * @param string $text Text to find suggestions for.
	 * @return array Suggestions as built by processQueryResults().
	 */
	public function query( $sourceLanguage, $targetLanguage, $text ) {
		// Calculate the bounds of the string length which are able
		// to satisfy the cutoff percentage in edit distance.
		// NOTE(review): presumably 'cutoff' is a fraction in ]0, 1] - confirm.
		$len = mb_strlen( $text );
		$min = ceil( max( $len * $this->config['cutoff'], 2 ) );
		$max = floor( $len / $this->config['cutoff'] );

		// We could use fulltext index to narrow the results further
		$dbr = $this->getDB( DB_REPLICA );
		$tables = [ 'translate_tmt', 'translate_tms' ];
		$fields = [ 'tms_context', 'tms_text', 'tmt_lang', 'tmt_text' ];

		$conds = [
			'tms_lang' => $sourceLanguage,
			'tmt_lang' => $targetLanguage,
			"tms_len BETWEEN $min AND $max",
			'tms_sid = tmt_sid',
		];

		$fulltext = $this->filterForFulltext( $sourceLanguage, $text );
		if ( $fulltext ) {
			$tables[] = 'translate_tmf';
			$list = implode( ' ', $fulltext );
			$conds[] = 'tmf_sid = tmt_sid';
			// The tokens only contain alphanumeric characters, but let the
			// database layer do the quoting instead of relying on that.
			$conds[] = 'MATCH(tmf_text) AGAINST( ' . $dbr->addQuotes( $list ) . ' )';
		}

		$res = $dbr->select( $tables, $fields, $conds, __METHOD__ );

		return $this->processQueryResults( $res, $text, $targetLanguage );
	}

	/**
	 * Scores each candidate row against the text and keeps those whose
	 * quality reaches the configured cutoff.
	 *
	 * @param iterable $res Rows having tms_text, tmt_text and tms_context.
	 * @param string $text Text the suggestions are wanted for.
	 * @param string $targetLanguage Language code appended to the location.
	 * @return array Suggestions with the keys source, target, context,
	 *  location, quality and wiki, passed through TTMServer::sortSuggestions().
	 */
	protected function processQueryResults( $res, $text, $targetLanguage ) {
		$timeLimit = microtime( true ) + 5;

		$lenA = mb_strlen( $text );
		$results = [];
		foreach ( $res as $row ) {
			if ( microtime( true ) > $timeLimit ) {
				// Having no suggestions is better than preventing translation
				// altogether by timing out the request :(
				break;
			}

			$a = $text;
			$b = $row->tms_text;
			$lenB = mb_strlen( $b );
			$len = min( $lenA, $lenB );
			if ( $len === 0 ) {
				// An empty string cannot be a useful match and would cause
				// a division by zero in the quality calculation below.
				continue;
			}

			if ( $len > 600 ) {
				// two strings of length 1500 ~ 10s
				// two strings of length 2250 ~ 30s
				$dist = $len;
			} else {
				$dist = self::levenshtein( $a, $b, $lenA, $lenB );
			}
			$quality = 1 - ( $dist * 0.9 / $len );

			if ( $quality >= $this->config['cutoff'] ) {
				$results[] = [
					'source' => $row->tms_text,
					'target' => $row->tmt_text,
					'context' => $row->tms_context,
					'location' => $row->tms_context . '/' . $targetLanguage,
					'quality' => $quality,
					// Falls back to the current wiki when the row carries
					// no tms_wiki value.
					'wiki' => $row->tms_wiki ?? wfWikiID(),
				];
			}
		}
		$results = TTMServer::sortSuggestions( $results );

		return $results;
	}
}