summaryrefslogtreecommitdiff
path: root/www/wiki/maintenance/recountCategories.php
diff options
context:
space:
mode:
Diffstat (limited to 'www/wiki/maintenance/recountCategories.php')
-rw-r--r--www/wiki/maintenance/recountCategories.php172
1 files changed, 172 insertions, 0 deletions
diff --git a/www/wiki/maintenance/recountCategories.php b/www/wiki/maintenance/recountCategories.php
new file mode 100644
index 00000000..7e8f0636
--- /dev/null
+++ b/www/wiki/maintenance/recountCategories.php
@@ -0,0 +1,172 @@
+<?php
+/**
+ * Refreshes category counts.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup Maintenance
+ */
+
+require_once __DIR__ . '/Maintenance.php';
+
+use MediaWiki\MediaWikiServices;
+
+/**
+ * Maintenance script that refreshes category membership counts in the category
+ * table.
+ *
+ * (The populateCategory.php script will also recalculate counts, but
+ * recountCategories only updates rows that need to be updated, making it more
+ * efficient.)
+ *
+ * @ingroup Maintenance
+ */
+class RecountCategories extends Maintenance {
+ public function __construct() {
+ parent::__construct();
+ $this->addDescription( <<<'TEXT'
+This script refreshes the category membership counts stored in the category
+table. As time passes, these counts often drift from the actual number of
+category members. The script identifies rows where the value in the category
+table does not match the number of categorylinks rows for that category, and
+updates the category table accordingly.
+
+To fully refresh the data in the category table, you need to run this script
+three times: once in each mode. Alternatively, just one mode can be run if
+required.
+TEXT
+ );
+ $this->addOption(
+ 'mode',
+ '(REQUIRED) Which category count column to recompute: "pages", "subcats" or "files".',
+ true,
+ true
+ );
+ $this->addOption(
+ 'begin',
+ 'Only recount categories with cat_id greater than the given value',
+ false,
+ true
+ );
+ $this->addOption(
+ 'throttle',
+ 'Wait this many milliseconds after each batch. Default: 0',
+ false,
+ true
+ );
+
+ $this->setBatchSize( 500 );
+ }
+
+ public function execute() {
+ $this->mode = $this->getOption( 'mode' );
+ if ( !in_array( $this->mode, [ 'pages', 'subcats', 'files' ] ) ) {
+ $this->fatalError( 'Please specify a valid mode: one of "pages", "subcats" or "files".' );
+ }
+
+ $this->minimumId = intval( $this->getOption( 'begin', 0 ) );
+
+ // do the work, batch by batch
+ $affectedRows = 0;
+ while ( ( $result = $this->doWork() ) !== false ) {
+ $affectedRows += $result;
+ usleep( $this->getOption( 'throttle', 0 ) * 1000 );
+ }
+
+ $this->output( "Done! Updated the {$this->mode} counts of $affectedRows categories.\n" .
+ "Now run the script using the other --mode options if you haven't already.\n" );
+ if ( $this->mode === 'pages' ) {
+ $this->output(
+ "Also run 'php cleanupEmptyCategories.php --mode remove' to remove empty,\n" .
+ "nonexistent categories from the category table.\n\n" );
+ }
+ }
+
+ protected function doWork() {
+ $this->output( "Finding up to {$this->getBatchSize()} drifted rows " .
+ "starting at cat_id {$this->getBatchSize()}...\n" );
+
+ $countingConds = [ 'cl_to = cat_title' ];
+ if ( $this->mode === 'subcats' ) {
+ $countingConds['cl_type'] = 'subcat';
+ } elseif ( $this->mode === 'files' ) {
+ $countingConds['cl_type'] = 'file';
+ }
+
+ $dbr = $this->getDB( DB_REPLICA, 'vslow' );
+ $countingSubquery = $dbr->selectSQLText( 'categorylinks',
+ 'COUNT(*)',
+ $countingConds,
+ __METHOD__ );
+
+ // First, let's find out which categories have drifted and need to be updated.
+ // The query counts the categorylinks for each category on the replica DB,
+ // but this data can't be used for updating the master, so we don't include it
+ // in the results.
+ $idsToUpdate = $dbr->selectFieldValues( 'category',
+ 'cat_id',
+ [
+ 'cat_id > ' . $this->minimumId,
+ "cat_{$this->mode} != ($countingSubquery)"
+ ],
+ __METHOD__,
+ [ 'LIMIT' => $this->getBatchSize() ]
+ );
+ if ( !$idsToUpdate ) {
+ return false;
+ }
+ $this->output( "Updating cat_{$this->mode} field on " .
+ count( $idsToUpdate ) . " rows...\n" );
+
+ // In the next batch, start where this query left off. The rows selected
+ // in this iteration shouldn't be selected again after being updated, but
+ // we still keep track of where we are up to, as extra protection against
+ // infinite loops.
+ $this->minimumId = end( $idsToUpdate );
+
+ // Now, on master, find the correct counts for these categories.
+ $dbw = $this->getDB( DB_MASTER );
+ $res = $dbw->select( 'category',
+ [ 'cat_id', 'count' => "($countingSubquery)" ],
+ [ 'cat_id' => $idsToUpdate ],
+ __METHOD__ );
+
+ // Update the category counts on the rows we just identified.
+ // This logic is equivalent to Category::refreshCounts, except here, we
+ // don't remove rows when cat_pages is zero and the category description page
+ // doesn't exist - instead we print a suggestion to run
+ // cleanupEmptyCategories.php.
+ $affectedRows = 0;
+ foreach ( $res as $row ) {
+ $dbw->update( 'category',
+ [ "cat_{$this->mode}" => $row->count ],
+ [
+ 'cat_id' => $row->cat_id,
+ "cat_{$this->mode} != " . (int)( $row->count ),
+ ],
+ __METHOD__ );
+ $affectedRows += $dbw->affectedRows();
+ }
+
+ MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication();
+
+ return $affectedRows;
+ }
+}
+
+$maintClass = RecountCategories::class;
+require_once RUN_MAINTENANCE_IF_MAIN;