diff options
Diffstat (limited to 'www/wiki/extensions/SemanticMediaWiki/src/Maintenance/DataRebuilder.php')
-rw-r--r-- | www/wiki/extensions/SemanticMediaWiki/src/Maintenance/DataRebuilder.php | 532 |
1 files changed, 532 insertions, 0 deletions
diff --git a/www/wiki/extensions/SemanticMediaWiki/src/Maintenance/DataRebuilder.php b/www/wiki/extensions/SemanticMediaWiki/src/Maintenance/DataRebuilder.php new file mode 100644 index 00000000..46fc5688 --- /dev/null +++ b/www/wiki/extensions/SemanticMediaWiki/src/Maintenance/DataRebuilder.php @@ -0,0 +1,532 @@ +<?php + +namespace SMW\Maintenance; + +use Exception; +use LinkCache; +use Onoi\MessageReporter\MessageReporter; +use Onoi\MessageReporter\MessageReporterFactory; +use SMW\ApplicationFactory; +use SMW\DIWikiPage; +use SMW\MediaWiki\TitleFactory; +use SMW\Options; +use SMW\Store; +use Title; + +/** + * Is part of the `rebuildData.php` maintenance script to rebuild existing data + * for the store + * + * @note This is an internal class and should not be used outside of smw-core + * + * @license GNU GPL v2+ + * @since 1.9.2 + * + * @author mwjames + */ +class DataRebuilder { + + /** + * @var Store + */ + private $store; + + /** + * @var TitleFactory + */ + private $titleFactory; + + /** + * @var Options + */ + private $options; + + /** + * @var MessageReporter + */ + private $reporter; + + /** + * @var DistinctEntityDataRebuilder + */ + private $distinctEntityDataRebuilder; + + /** + * @var ExceptionFileLogger + */ + private $exceptionFileLogger; + + /** + * @var integer + */ + private $rebuildCount = 0; + + /** + * @var integer + */ + private $exceptionCount = 0; + + private $delay = false; + private $canWriteToIdFile = false; + private $start = 1; + private $end = false; + + /** + * @var int[] + */ + private $filters = []; + private $verbose = false; + private $startIdFile = false; + + /** + * @since 1.9.2 + * + * @param Store $store + * @param TitleFactory $titleFactory + */ + public function __construct( Store $store, TitleFactory $titleFactory ) { + $this->store = $store; + $this->titleFactory = $titleFactory; + $this->reporter = MessageReporterFactory::getInstance()->newNullMessageReporter(); + $this->distinctEntityDataRebuilder = new DistinctEntityDataRebuilder( $store, $titleFactory ); + $this->exceptionFileLogger = new ExceptionFileLogger( 'rebuilddata' ); + } + + /** + * @since 2.1 + * + * @param MessageReporter $reporter + */ + public function setMessageReporter( MessageReporter $reporter ) { + $this->reporter = $reporter; + } + + /** + * @since 1.9.2 + * + * @param Options $options + */ + public function setOptions( Options $options ) { + $this->options = $options; + + if ( $options->has( 'server' ) ) { + $GLOBALS['wgServer'] = $options->get( 'server' ); + } + + if ( $options->has( 'd' ) ) { + $this->delay = intval( $options->get( 'd' ) ) * 1000; // convert milliseconds to microseconds + } + + if ( $options->has( 's' ) ) { + $this->start = max( 1, intval( $options->get( 's' ) ) ); + } elseif ( $options->has( 'startidfile' ) ) { + + $this->canWriteToIdFile = $this->is_writable( $options->get( 'startidfile' ) ); + $this->startIdFile = $options->get( 'startidfile' ); + + if ( is_readable( $options->get( 'startidfile' ) ) ) { + $this->start = max( 1, intval( file_get_contents( $options->get( 'startidfile' ) ) ) ); + } + } + + // Note: this might reasonably be larger than the page count + if ( $options->has( 'e' ) ) { + $this->end = intval( $options->get( 'e' ) ); + } elseif ( $options->has( 'n' ) ) { + $this->end = $this->start + intval( $options->get( 'n' ) ); + } + + $this->verbose = $options->has( 'v' ); + $this->exceptionFileLogger->setOptions( $options ); + + $this->setFiltersFromOptions( $options ); + } + + /** + * @since 1.9.2 + * + * @return boolean + */ + public function rebuild() { + + $this->reportMessage( + "\nLong-running scripts may cause memory leaks, if a deteriorating\n" . + "rebuild process is detected (after many pages, typically more\n". + "than 10000), please abort with CTRL-C and resume this script\n" . + "at the last processed ID using the parameter -s. Continue this\n" . + "until all pages have been refreshed.\n" + ); + + $this->reportMessage( + "\nThe progress displayed is an estimation and is self-adjusting \n" . + "during the maintenance process.\n" + ); + + $storeName = get_class( $this->store ); + + if ( strpos( $storeName, "\\") !== false ) { + $storeName = explode("\\", $storeName ); + $storeName = end( $storeName ); + } + + $this->reportMessage( "\nRunning for storage: " . $storeName . "\n\n" ); + + if ( $this->options->has( 'f' ) ) { + $this->performFullDelete(); + } + + if ( $this->options->has( 'page' ) || $this->options->has( 'query' ) || $this->hasFilters() || $this->options->has( 'redirects' ) ) { + return $this->rebuild_selection(); + } + + return $this->rebuild_all(); + } + + private function hasFilters() { + return $this->filters !== []; + } + + /** + * @since 1.9.2 + * + * @return int + */ + public function getRebuildCount() { + return $this->rebuildCount; + } + + /** + * @since 3.0 + * + * @return int + */ + public function getExceptionCount() { + return $this->exceptionCount; + } + + private function rebuild_selection() { + + $this->distinctEntityDataRebuilder->setOptions( + $this->options + ); + + $this->distinctEntityDataRebuilder->setMessageReporter( + $this->reporter + ); + + $this->distinctEntityDataRebuilder->setExceptionFileLogger( + $this->exceptionFileLogger + ); + + $this->distinctEntityDataRebuilder->doRebuild(); + + $this->rebuildCount = $this->distinctEntityDataRebuilder->getRebuildCount(); + + if ( $this->options->has( 'ignore-exceptions' ) && $this->exceptionFileLogger->getExceptionCount() > 0 ) { + $count = $this->exceptionFileLogger->getExceptionCount(); + $this->exceptionFileLogger->doWrite(); + + $path_parts = pathinfo( + str_replace( [ '\\', '/' ], DIRECTORY_SEPARATOR, $this->exceptionFileLogger->getExceptionFile() ) + ); + + $this->reportMessage( "\nException log ..." ); + $this->reportMessage( "\n ... counted $count exceptions" ); + $this->reportMessage( "\n ... written to ... " . $path_parts['basename'] ); + $this->reportMessage( "\n ... done.\n" ); + + $this->exceptionCount += $count; + } + + return true; + } + + private function rebuild_all() { + + $this->entityRebuildDispatcher = $this->store->refreshData( + $this->start, + 1 + ); + + $this->entityRebuildDispatcher->setDispatchRangeLimit( 1 ); + + $this->entityRebuildDispatcher->setOptions( + [ + 'shallow-update' => $this->options->safeGet( 'shallow-update', false ), + 'force-update' => $this->options->safeGet( 'force-update', false ), + 'revision-mode' => $this->options->safeGet( 'revision-mode', false ), + 'use-job' => false + ] + ); + + // By default we expect the disposal action to take place whenever the + // script is run + $this->dispose_outdated(); + + // Only expected the disposal action? + if ( $this->options->has( 'dispose-outdated' ) ) { + return true; + } + + $this->reportMessage( "\n" ); + + if ( !$this->options->has( 'skip-properties' ) ) { + $this->options->set( 'p', true ); + $this->rebuild_selection(); + $this->reportMessage( "\n" ); + } + + $this->store->clear(); + + if ( $this->start > 1 && $this->end === false ) { + $this->end = $this->entityRebuildDispatcher->getMaxId(); + } + + $total = $this->end && $this->end - $this->start > 0 ? $this->end - $this->start : $this->entityRebuildDispatcher->getMaxId(); + $id = $this->start; + + $this->reportMessage( + "Rebuilding semantic data ..." + ); + + $this->reportMessage( + "\n ... selecting $this->start to " . + ( $this->end ? "$this->end" : $this->entityRebuildDispatcher->getMaxId() ) . " IDs ...\n" + ); + + $this->rebuildCount = 0; + $progress = 0; + $estimatedProgress = 0; + $skipped_update = 0; + + while ( ( ( !$this->end ) || ( $id <= $this->end ) ) && ( $id > 0 ) ) { + + $current_id = $id; + + // Changes the ID to next target! + $this->do_update( $id ); + + if ( $this->rebuildCount % 60 === 0 ) { + $estimatedProgress = $this->entityRebuildDispatcher->getEstimatedProgress(); + } + + $progress = round( ( $this->end - $this->start > 0 ? $this->rebuildCount / $total : $estimatedProgress ) * 100 ); + + foreach ( $this->entityRebuildDispatcher->getDispatchedEntities() as $value ) { + + if ( isset( $value['skipped'] ) ) { + $skipped_update++; + continue; + } + + $text = $this->getHumanReadableTextFrom( $current_id, $value ); + + $this->reportMessage( + sprintf( "%-16s%s\n", " ... updating", sprintf( "%-10s%s", $text[0], $text[1] ) ), + $this->options->has( 'v' ) + ); + } + + if ( !$this->options->has( 'v' ) && $id > 0 ) { + $this->reportMessage( + "\r". sprintf( "%-50s%s", " ... updating document no.", sprintf( "%s (%1.0f%%)", $current_id, min( 100, $progress ) ) ) + ); + } + } + + if ( !$this->options->has( 'v' ) ) { + $this->reportMessage( + "\r". sprintf( "%-50s%s", " ... updating document no.", sprintf( "%s (%1.0f%%)", $current_id, 100 ) ) + ); + } + + $this->write_to_file( $id ); + + $this->reportMessage( "\n ... $this->rebuildCount IDs checked or refreshed ..." ); + $this->reportMessage( "\n ... $skipped_update IDs skipped ..." ); + $this->reportMessage( "\n ... done.\n" ); + + if ( $this->options->has( 'ignore-exceptions' ) && $this->exceptionFileLogger->getExceptionCount() > 0 ) { + $this->exceptionCount += $this->exceptionFileLogger->getExceptionCount(); + $this->exceptionFileLogger->doWrite(); + + $path_parts = pathinfo( + str_replace( [ '\\', '/' ], DIRECTORY_SEPARATOR, $this->exceptionFileLogger->getExceptionFile() ) + ); + + $this->reportMessage( "\nException log ..." ); + $this->reportMessage( "\n ... counted $this->exceptionCount exceptions" ); + $this->reportMessage( "\n ... written to ... " . $path_parts['basename'] ); + $this->reportMessage( "\n ... done.\n" ); + } + + return true; + } + + private function do_update( &$id ) { + + if ( !$this->options->has( 'ignore-exceptions' ) ) { + $this->entityRebuildDispatcher->rebuild( $id ); + } else { + + try { + $this->entityRebuildDispatcher->rebuild( $id ); + } catch ( Exception $e ) { + $this->exceptionFileLogger->recordException( $id, $e ); + } + } + + if ( $this->delay !== false ) { + usleep( $this->delay ); + } + + if ( $this->rebuildCount % 100 === 0 ) { // every 100 pages only + LinkCache::singleton()->clear(); // avoid memory leaks + } + + $this->rebuildCount++; + } + + private function getHumanReadableTextFrom( $id, array $entities ) { + + if ( !$this->options->has( 'v' ) ) { + return [ '', '']; + } + + // Indicates whether this is a MW page (*) or SMW's object table + $text = $id . ( isset( $entities['t'] ) ? '*' : ' ' ); + + $entity = end( $entities ); + + if ( $entity instanceof \Title ) { + return [ $text, '[' . $entity->getPrefixedDBKey() .']' ]; + } + + if ( $entity instanceof DIWikiPage ) { + return [ $text, '[' . $entity->getHash() .']' ]; + } + + return [ $text, '[' . ( is_string( $entity ) && $entity !== '' ? $entity : 'N/A' ) . ']' ]; + } + + private function performFullDelete() { + + $this->reportMessage( + "Deleting all stored data completely and rebuilding it again later!\n\n" . + "Semantic data in the wiki might be incomplete for some time while\n". + "this operation runs.\n\n" . + "NOTE: It is usually necessary to run this script ONE MORE TIME\n". + "after this operation, given that some properties and types are not\n" . + "yet stored with the first run.\n\n" + ); + + if ( $this->options->has( 's' ) || $this->options->has( 'e' ) ) { + $this->reportMessage( + "WARNING: -s or -e are used, so some pages will not be refreshed at all!\n" . + "Data for those pages will only be available again when they have been\n" . + "refreshed as well!\n\n" + ); + } + + $obLevel = ob_get_level(); + + $this->reportMessage( 'Abort with control-c in the next five seconds ... ' ); + swfCountDown( 6 ); + + $this->reportMessage( "\nDeleting all data ..." ); + + $this->reportMessage( "\n ... dropping tables ..." ); + $this->store->drop( $this->verbose ); + + $this->reportMessage( "\n ... creating tables ..." ); + $this->store->setupStore( $this->verbose ); + + $this->reportMessage( "\n ... done.\n" ); + + // Be sure to have some buffer, otherwise some PHPs complain + while ( ob_get_level() > $obLevel ) { + ob_end_flush(); + } + + $this->reportMessage( "\nAll storage structures have been deleted and recreated.\n\n" ); + + return true; + } + + private function dispose_outdated() { + + $applicationFactory = ApplicationFactory::getInstance(); + $entityIdDisposerJob = $applicationFactory->newJobFactory()->newEntityIdDisposerJob( + Title::newFromText( __METHOD__ ) + ); + + $outdatedEntitiesResultIterator = $entityIdDisposerJob->newOutdatedEntitiesResultIterator(); + $matchesCount = $outdatedEntitiesResultIterator->count(); + $counter = 0; + + $this->reportMessage( "Removing outdated entities ..." ); + + if ( $matchesCount > 0 ) { + $this->reportMessage( "\n" ); + + $chunkedIterator = $applicationFactory->getIteratorFactory()->newChunkedIterator( + $outdatedEntitiesResultIterator, + 200 + ); + + foreach ( $chunkedIterator as $chunk ) { + foreach ( $chunk as $row ) { + $counter++; + $msg = sprintf( "%s (%1.0f%%)", $row->smw_id, round( $counter / $matchesCount * 100 ) ); + + $this->reportMessage( + "\r". sprintf( "%-50s%s", " ... cleaning up document no.", $msg ) + ); + + $entityIdDisposerJob->dispose( $row ); + } + } + + $this->reportMessage( "\n ... {$matchesCount} IDs removed ..." ); + } + + $this->reportMessage( "\n ... done.\n" ); + } + + private function is_writable( $startIdFile ) { + + if ( !is_writable( file_exists( $startIdFile ) ? $startIdFile : dirname( $startIdFile ) ) ) { + die( "Cannot use a startidfile that we can't write to.\n" ); + } + + return true; + } + + private function write_to_file( $id ) { + if ( $this->canWriteToIdFile ) { + file_put_contents( $this->startIdFile, "$id" ); + } + } + + /** + * @param array $options + */ + private function setFiltersFromOptions( Options $options ) { + $this->filters = []; + + if ( $options->has( 'categories' ) ) { + $this->filters[] = NS_CATEGORY; + } + + if ( $options->has( 'p' ) ) { + $this->filters[] = SMW_NS_PROPERTY; + } + } + + private function reportMessage( $message, $output = true ) { + if ( $output ) { + $this->reporter->reportMessage( $message ); + } + } + +} |