summaryrefslogtreecommitdiff
path: root/www/wiki/maintenance/cleanupInvalidDbKeys.php
diff options
context:
space:
mode:
Diffstat (limited to 'www/wiki/maintenance/cleanupInvalidDbKeys.php')
-rw-r--r--www/wiki/maintenance/cleanupInvalidDbKeys.php311
1 files changed, 311 insertions, 0 deletions
diff --git a/www/wiki/maintenance/cleanupInvalidDbKeys.php b/www/wiki/maintenance/cleanupInvalidDbKeys.php
new file mode 100644
index 00000000..54ed3aad
--- /dev/null
+++ b/www/wiki/maintenance/cleanupInvalidDbKeys.php
@@ -0,0 +1,311 @@
+<?php
+/**
+ * Cleans up invalid titles in various tables.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup Maintenance
+ */
+
+require_once __DIR__ . '/Maintenance.php';
+
+/**
+ * Maintenance script that cleans up invalid titles in various tables.
+ *
+ * @since 1.29
+ * @ingroup Maintenance
+ */
+class CleanupInvalidDbKeys extends Maintenance {
+ /** @var array List of tables to clean up, and the field prefix for that table */
+ protected static $tables = [
+ // Data tables
+ [ 'page', 'page' ],
+ [ 'redirect', 'rd', 'idField' => 'rd_from' ],
+ [ 'archive', 'ar' ],
+ [ 'logging', 'log' ],
+ [ 'protected_titles', 'pt', 'idField' => 0 ],
+ [ 'category', 'cat', 'nsField' => 14 ],
+ [ 'recentchanges', 'rc' ],
+ [ 'watchlist', 'wl' ],
+ // The querycache tables' qc(c)_title and qcc_titletwo may contain titles,
+ // but also usernames or other things like that, so we leave them alone
+
+ // Links tables
+ [ 'pagelinks', 'pl', 'idField' => 'pl_from' ],
+ [ 'templatelinks', 'tl', 'idField' => 'tl_from' ],
+ [ 'categorylinks', 'cl', 'idField' => 'cl_from', 'nsField' => 14, 'titleField' => 'cl_to' ],
+ ];
+
+ public function __construct() {
+ parent::__construct();
+ $this->addDescription( <<<'TEXT'
+This script cleans up the title fields in various tables to remove entries that
+will be rejected by the constructor of TitleValue. This constructor throws an
+exception when invalid data is encountered, which will not normally occur on
+regular page views, but can happen on query special pages.
+
+The script targets titles matching the regular expression /^_|[ \r\n\t]|_$/.
+Because any foreign key relationships involving these titles will already be
+broken, the titles are corrected to a valid version or the rows are deleted
+entirely, depending on the table.
+
+The script runs with the expectation that STDOUT is redirected to a file.
+TEXT
+ );
+ $this->addOption( 'fix', 'Actually clean up invalid titles. If this parameter is ' .
+ 'not specified, the script will report invalid titles but not clean them up.',
+ false, false );
+ $this->addOption( 'table', 'The table(s) to process. This option can be specified ' .
+ 'more than once (e.g. -t category -t watchlist). If not specified, all available ' .
+ 'tables will be processed. Available tables are: ' .
+ implode( ', ', array_column( static::$tables, 0 ) ), false, true, 't', true );
+
+ $this->setBatchSize( 500 );
+ }
+
+ public function execute() {
+ $tablesToProcess = $this->getOption( 'table' );
+ foreach ( static::$tables as $tableParams ) {
+ if ( !$tablesToProcess || in_array( $tableParams[0], $tablesToProcess ) ) {
+ $this->cleanupTable( $tableParams );
+ }
+ }
+
+ $this->outputStatus( 'Done!' );
+ if ( $this->hasOption( 'fix' ) ) {
+ $this->outputStatus( ' Cleaned up invalid DB keys on ' . wfWikiID() . "!\n" );
+ }
+ }
+
+ /**
+ * Prints text to STDOUT, and STDERR if STDOUT was redirected to a file.
+ * Used for progress reporting.
+ *
+ * @param string $str Text to write to both places
+ * @param string|null $channel Ignored
+ */
+ protected function outputStatus( $str, $channel = null ) {
+ // Make it easier to find progress lines in the STDOUT log
+ if ( trim( $str ) ) {
+ fwrite( STDOUT, '*** ' . trim( $str ) . "\n" );
+ }
+ fwrite( STDERR, $str );
+ }
+
+ /**
+ * Prints text to STDOUT. Used for logging output.
+ *
+ * @param string $str Text to write
+ */
+ protected function writeToReport( $str ) {
+ fwrite( STDOUT, $str );
+ }
+
+ /**
+ * Identifies, and optionally cleans up, invalid titles.
+ *
+ * @param array $tableParams A child array of self::$tables
+ */
+ protected function cleanupTable( $tableParams ) {
+ $table = $tableParams[0];
+ $prefix = $tableParams[1];
+ $idField = isset( $tableParams['idField'] ) ?
+ $tableParams['idField'] :
+ "{$prefix}_id";
+ $nsField = isset( $tableParams['nsField'] ) ?
+ $tableParams['nsField'] :
+ "{$prefix}_namespace";
+ $titleField = isset( $tableParams['titleField'] ) ?
+ $tableParams['titleField'] :
+ "{$prefix}_title";
+
+ $this->outputStatus( "Looking for invalid $titleField entries in $table...\n" );
+
+ // Do all the select queries on the replicas, as they are slow (they use
+ // unanchored LIKEs). Naturally this could cause problems if rows are
+ // modified after selecting and before deleting/updating, but working on
+ // the hypothesis that invalid rows will be old and in all likelihood
+ // unreferenced, we should be fine to do it like this.
+ $dbr = $this->getDB( DB_REPLICA, 'vslow' );
+
+ // Find all TitleValue-invalid titles.
+ $percent = $dbr->anyString(); // DBMS-agnostic equivalent of '%' LIKE wildcard
+ $res = $dbr->select(
+ $table,
+ [
+ 'id' => $idField,
+ 'ns' => $nsField,
+ 'title' => $titleField,
+ ],
+ // The REGEXP operator is not cross-DBMS, so we have to use lots of LIKEs
+ [ $dbr->makeList( [
+ $titleField . $dbr->buildLike( $percent, ' ', $percent ),
+ $titleField . $dbr->buildLike( $percent, "\r", $percent ),
+ $titleField . $dbr->buildLike( $percent, "\n", $percent ),
+ $titleField . $dbr->buildLike( $percent, "\t", $percent ),
+ $titleField . $dbr->buildLike( '_', $percent ),
+ $titleField . $dbr->buildLike( $percent, '_' ),
+ ], LIST_OR ) ],
+ __METHOD__,
+ [ 'LIMIT' => $this->getBatchSize() ]
+ );
+
+ $this->outputStatus( "Number of invalid rows: " . $res->numRows() . "\n" );
+ if ( !$res->numRows() ) {
+ $this->outputStatus( "\n" );
+ return;
+ }
+
+ // Write a table of titles to the report file. Also keep a list of the found
+ // IDs, as we might need it later for DB updates
+ $this->writeToReport( sprintf( "%10s | ns | dbkey\n", $idField ) );
+ $ids = [];
+ foreach ( $res as $row ) {
+ $this->writeToReport( sprintf( "%10d | %3d | %s\n", $row->id, $row->ns, $row->title ) );
+ $ids[] = $row->id;
+ }
+
+ // If we're doing a dry run, output the new titles we would use for the UPDATE
+ // queries (if relevant), and finish
+ if ( !$this->hasOption( 'fix' ) ) {
+ if ( $table === 'logging' || $table === 'archive' ) {
+ $this->writeToReport( "The following updates would be run with the --fix flag:\n" );
+ foreach ( $res as $row ) {
+ $newTitle = self::makeValidTitle( $row->title );
+ $this->writeToReport(
+ "$idField={$row->id}: update '{$row->title}' to '$newTitle'\n" );
+ }
+ }
+
+ if ( $table !== 'page' && $table !== 'redirect' ) {
+ $this->outputStatus( "Run with --fix to clean up these rows\n" );
+ }
+ $this->outputStatus( "\n" );
+ return;
+ }
+
+ // Fix the bad data, using different logic for the various tables
+ $dbw = $this->getDB( DB_MASTER );
+ switch ( $table ) {
+ case 'page':
+ case 'redirect':
+ // This shouldn't happen on production wikis, and we already have a script
+ // to handle 'page' rows anyway, so just notify the user and let them decide
+ // what to do next.
+ $this->outputStatus( <<<TEXT
+IMPORTANT: This script does not fix invalid entries in the $table table.
+Consider repairing these rows, and rows in related tables, by hand.
+You may like to run, or borrow logic from, the cleanupTitles.php script.
+
+TEXT
+ );
+ break;
+
+ case 'archive':
+ case 'logging':
+ // Rename the title to a corrected equivalent. Any foreign key relationships
+ // to the page_title field are already broken, so this will just make sure
+ // users can still access the log entries/deleted revisions from the interface
+ // using a valid page title.
+ $this->outputStatus(
+ "Updating these rows, setting $titleField to the closest valid DB key...\n" );
+ $affectedRowCount = 0;
+ foreach ( $res as $row ) {
+ $newTitle = self::makeValidTitle( $row->title );
+ $this->writeToReport(
+ "$idField={$row->id}: updating '{$row->title}' to '$newTitle'\n" );
+
+ $dbw->update( $table,
+ [ $titleField => $newTitle ],
+ [ $idField => $row->id ],
+ __METHOD__ );
+ $affectedRowCount += $dbw->affectedRows();
+ }
+ wfWaitForSlaves();
+ $this->outputStatus( "Updated $affectedRowCount rows on $table.\n" );
+
+ break;
+
+ case 'recentchanges':
+ case 'watchlist':
+ case 'category':
+ // Since these broken titles can't exist, there's really nothing to watch,
+ // nothing can be categorised in them, and they can't have been changed
+ // recently, so we can just remove these rows.
+ $this->outputStatus( "Deleting invalid $table rows...\n" );
+ $dbw->delete( $table, [ $idField => $ids ], __METHOD__ );
+ wfWaitForSlaves();
+ $this->outputStatus( 'Deleted ' . $dbw->affectedRows() . " rows from $table.\n" );
+ break;
+
+ case 'protected_titles':
+ // Since these broken titles can't exist, there's really nothing to protect,
+ // so we can just remove these rows. Made more complicated by this table
+ // not having an ID field
+ $this->outputStatus( "Deleting invalid $table rows...\n" );
+ $affectedRowCount = 0;
+ foreach ( $res as $row ) {
+ $dbw->delete( $table,
+ [ $nsField => $row->ns, $titleField => $row->title ],
+ __METHOD__ );
+ $affectedRowCount += $dbw->affectedRows();
+ }
+ wfWaitForSlaves();
+ $this->outputStatus( "Deleted $affectedRowCount rows from $table.\n" );
+ break;
+
+ case 'pagelinks':
+ case 'templatelinks':
+ case 'categorylinks':
+ // Update links tables for each page where these bogus links are supposedly
+ // located. If the invalid rows don't go away after these jobs go through,
+ // they're probably being added by a buggy hook.
+ $this->outputStatus( "Queueing link update jobs for the pages in $idField...\n" );
+ foreach ( $res as $row ) {
+ $wp = WikiPage::newFromID( $row->id );
+ if ( $wp ) {
+ RefreshLinks::fixLinksFromArticle( $row->id );
+ } else {
+ // This link entry points to a nonexistent page, so just get rid of it
+ $dbw->delete( $table,
+ [ $idField => $row->id, $nsField => $row->ns, $titleField => $row->title ],
+ __METHOD__ );
+ }
+ }
+ wfWaitForSlaves();
+ $this->outputStatus( "Link update jobs have been added to the job queue.\n" );
+ break;
+ }
+
+ $this->outputStatus( "\n" );
+ return;
+ }
+
+ /**
+ * Fix possible validation issues in the given title (DB key).
+ *
+ * @param string $invalidTitle
+ * @return string
+ */
+ protected static function makeValidTitle( $invalidTitle ) {
+ return strtr( trim( $invalidTitle, '_' ),
+ [ ' ' => '_', "\r" => '', "\n" => '', "\t" => '_' ] );
+ }
+}
+
+$maintClass = CleanupInvalidDbKeys::class;
+require_once RUN_MAINTENANCE_IF_MAIN;