summaryrefslogtreecommitdiff
path: root/www/wiki/includes/jobqueue/jobs/RefreshLinksJob.php
blob: 8854c6560fa520ebbc2240e10bd2f19d6f799cda (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
<?php
/**
 * Job to update link tables for pages
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup JobQueue
 */
use MediaWiki\MediaWikiServices;
use Wikimedia\Rdbms\DBReplicationWaitError;

/**
 * Job to update link tables for pages
 *
 * This job comes in a few variants:
 *   - a) Recursive jobs to update links for backlink pages for a given title.
 *        These jobs have (recursive:true,table:<table>) set.
 *   - b) Jobs to update links for a set of pages (the job title is ignored).
 *        These jobs have (pages:(<page ID>:(<namespace>,<title>),...) set.
 *   - c) Jobs to update links for a single page (the job title)
 *        These jobs need no extra fields set.
 *
 * @ingroup JobQueue
 */
class RefreshLinksJob extends Job {
	/** @var float Cache parser output when it takes this long (in seconds) to render */
	const PARSE_THRESHOLD_SEC = 1.0;
	/** @var int Lag safety margin (seconds) when comparing root job times to last-refresh times */
	const CLOCK_FUDGE = 10;
	/** @var int How many seconds to wait for replica DBs to catch up */
	const LAG_WAIT_TIMEOUT = 15;

	/**
	 * Build a 'refreshLinks' job.
	 *
	 * Parameter keys read by this class: 'recursive', 'range', 'pages',
	 * 'triggeringRevisionId', 'triggeringUser', 'rootJobTimestamp', 'isOpportunistic',
	 * 'useRecursiveLinksUpdate', 'causeAction' and 'causeAgent'. The two cause fields
	 * default to 'unknown' when the caller does not supply them.
	 *
	 * @param Title $title
	 * @param array $params
	 */
	public function __construct( Title $title, array $params ) {
		parent::__construct( 'refreshLinks', $title, $params );
		// Avoid the overhead of de-duplication when it would be pointless
		$this->removeDuplicates = (
			// Ranges rarely will line up
			!isset( $params['range'] ) &&
			// Multiple pages per job make matches unlikely
			!( isset( $params['pages'] ) && count( $params['pages'] ) != 1 )
		);
		// Array union: only fills in the cause fields that the caller left unset
		$this->params += [ 'causeAction' => 'unknown', 'causeAgent' => 'unknown' ];
	}

	/**
	 * Build a job that is identical to a regular one except that its command
	 * is 'refreshLinksPrioritized'.
	 *
	 * @param Title $title
	 * @param array $params
	 * @return RefreshLinksJob
	 */
	public static function newPrioritized( Title $title, array $params ) {
		$job = new self( $title, $params );
		$job->command = 'refreshLinksPrioritized';

		return $job;
	}

	/**
	 * Build a job that is identical to a regular one except that its command
	 * is 'refreshLinksDynamic'.
	 *
	 * @param Title $title
	 * @param array $params
	 * @return RefreshLinksJob
	 */
	public static function newDynamic( Title $title, array $params ) {
		$job = new self( $title, $params );
		$job->command = 'refreshLinksDynamic';

		return $job;
	}

	/**
	 * Run the job in one of three modes, chosen from the job parameters:
	 *   - 'recursive' set: partition the backlinks into further jobs and enqueue them
	 *   - 'pages' set: refresh the links of every listed (namespace, DB key) pair
	 *   - otherwise: refresh the links of the job title
	 *
	 * Per-title failures are recorded via setLastError() but do not change the result.
	 *
	 * @return bool Always true
	 */
	public function run() {
		global $wgUpdateRowsPerJob;

		// Job to update all (or a range of) backlink pages for a page
		if ( !empty( $this->params['recursive'] ) ) {
			// When the base job branches, wait for the replica DBs to catch up to the master.
			// From then on, we know that any template changes at the time the base job was
			// enqueued will be reflected in backlink page parses when the leaf jobs run.
			if ( !isset( $this->params['range'] ) ) {
				try {
					$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
					$lbFactory->waitForReplication( [
						'wiki'    => wfWikiID(),
						'timeout' => self::LAG_WAIT_TIMEOUT
					] );
				} catch ( DBReplicationWaitError $e ) { // only try so hard
					$stats = MediaWikiServices::getInstance()->getStatsdDataFactory();
					$stats->increment( 'refreshlinks.lag_wait_failed' );
				}
			}
			// Carry over information for de-duplication
			$extraParams = $this->getRootJobParams();
			$extraParams['triggeredRecursive'] = true;
			// Carry over cause information for logging
			$extraParams['causeAction'] = $this->params['causeAction'];
			$extraParams['causeAgent'] = $this->params['causeAgent'];
			// Convert this into no more than $wgUpdateRowsPerJob RefreshLinks per-title
			// jobs and possibly a recursive RefreshLinks job for the rest of the backlinks
			$jobs = BacklinkJobUtils::partitionBacklinkJob(
				$this,
				$wgUpdateRowsPerJob,
				1, // job-per-title
				[ 'params' => $extraParams ]
			);
			JobQueueGroup::singleton()->push( $jobs );
		// Job to update link tables for a set of titles
		} elseif ( isset( $this->params['pages'] ) ) {
			foreach ( $this->params['pages'] as $nsAndKey ) {
				list( $ns, $dbKey ) = $nsAndKey;
				$title = Title::makeTitleSafe( $ns, $dbKey );
				if ( !$title ) {
					// makeTitleSafe() gives no Title for an invalid namespace/key pair, and
					// runForTitle() requires one. Skip the entry instead of letting a type
					// error abort the remaining pages of this job.
					$this->setLastError( "Invalid title ($ns,$dbKey)." );
					continue;
				}
				$this->runForTitle( $title );
			}
		// Job to update link tables for a given title
		} else {
			$this->runForTitle( $this->title );
		}

		return true;
	}

	/**
	 * Refresh the link tables of a single page from its current revision.
	 *
	 * The update is skipped (returning true) when the page's links timestamp is already
	 * newer than the root job timestamp. A usable parser cache entry is reused when
	 * possible; otherwise the content is parsed, and the output is saved back to the
	 * parser cache only if the parse took at least PARSE_THRESHOLD_SEC.
	 *
	 * @param Title $title
	 * @return bool False if the revision could not be found or is not the current
	 *  revision of the page (the reason is recorded via setLastError()); true otherwise
	 */
	protected function runForTitle( Title $title ) {
		$services = MediaWikiServices::getInstance();
		$stats = $services->getStatsdDataFactory();
		$lbFactory = $services->getDBLoadBalancerFactory();
		$ticket = $lbFactory->getEmptyTransactionTicket( __METHOD__ );

		$page = WikiPage::factory( $title );
		$page->loadPageData( WikiPage::READ_LATEST );

		// Serialize links updates by page ID so they see each others' changes
		$dbw = $lbFactory->getMainLB()->getConnection( DB_MASTER );
		// The lock handle is never read; it only has to stay referenced until this
		// method returns.
		/** @noinspection PhpUnusedLocalVariableInspection */
		$scopedLock = LinksUpdate::acquirePageLock( $dbw, $page->getId(), 'job' );
		// Get the latest ID *after* acquirePageLock() flushed the transaction.
		// This is used to detect edits/moves after loadPageData() but before the scope lock.
		// This works around the chicken/egg problem of determining the scope lock key.
		$latest = $title->getLatestRevID( Title::GAID_FOR_UPDATE );

		if ( !empty( $this->params['triggeringRevisionId'] ) ) {
			// Fetch the specified revision; the current-revision check below detects if
			// the page was edited since and aborts in order to avoid corrupting the
			// link tables
			$revision = Revision::newFromId(
				$this->params['triggeringRevisionId'],
				Revision::READ_LATEST
			);
		} else {
			// Fetch current revision; READ_LATEST reduces failures of the
			// current-revision check below
			$revision = Revision::newFromTitle( $title, false, Revision::READ_LATEST );
		}

		if ( !$revision ) {
			$stats->increment( 'refreshlinks.rev_not_found' );
			$this->setLastError( "Revision not found for {$title->getPrefixedDBkey()}" );
			return false; // just deleted?
		} elseif ( $revision->getId() != $latest || $revision->getPage() !== $page->getId() ) {
			// Do not clobber over newer updates with older ones. If all jobs were FIFO and
			// serialized, it would be OK to update links based on older revisions since it
			// would eventually get to the latest. Since that is not the case (by design),
			// only update the link tables to a state matching the current revision's output.
			$stats->increment( 'refreshlinks.rev_not_current' );
			$this->setLastError( "Revision {$revision->getId()} is not current" );
			return false;
		}

		$content = $revision->getContent( Revision::RAW );
		if ( !$content ) {
			// If there is no content, pretend the content is empty
			$content = $revision->getContentHandler()->makeEmptyContent();
		}

		$parserOutput = false;
		$parserOptions = $page->makeParserOptions( 'canonical' );
		// If page_touched changed after this root job, then it is likely that
		// any views of the pages already resulted in re-parses which are now in
		// cache. The cache can be reused to avoid expensive parsing in some cases.
		if ( isset( $this->params['rootJobTimestamp'] ) ) {
			$opportunistic = !empty( $this->params['isOpportunistic'] );

			$skewedTimestamp = $this->params['rootJobTimestamp'];
			if ( $opportunistic ) {
				// Neither clock skew nor DB snapshot/replica DB lag matter much for such
				// updates; focus on reusing the (often recently updated) cache
			} else {
				// For transclusion updates, the template changes must be reflected,
				// so push the cut-off CLOCK_FUDGE seconds past the root job timestamp
				$skewedTimestamp = wfTimestamp( TS_MW,
					wfTimestamp( TS_UNIX, $skewedTimestamp ) + self::CLOCK_FUDGE
				);
			}

			if ( $page->getLinksTimestamp() > $skewedTimestamp ) {
				// Something already updated the backlinks since this job was made
				$stats->increment( 'refreshlinks.update_skipped' );
				return true;
			}

			if ( $page->getTouched() >= $this->params['rootJobTimestamp'] || $opportunistic ) {
				// Cache is suspected to be up-to-date. As long as the cache rev ID matches
				// and it reflects the job's triggering change, then it is usable.
				$parserOutput = $services->getParserCache()->getDirty( $page, $parserOptions );
				if ( !$parserOutput
					|| $parserOutput->getCacheRevisionId() != $revision->getId()
					|| $parserOutput->getCacheTime() < $skewedTimestamp
				) {
					$parserOutput = false; // too stale
				}
			}
		}

		// Reuse the cached parser output, or parse the current revision if necessary...
		if ( $parserOutput ) {
			$stats->increment( 'refreshlinks.parser_cached' );
		} else {
			$start = microtime( true );
			// Revision ID must be passed to the parser output to get revision variables correct
			$parserOutput = $content->getParserOutput(
				$title, $revision->getId(), $parserOptions, false );
			$elapsed = microtime( true ) - $start;
			// If it took a long time to render, then save this back to the cache to avoid
			// wasted CPU by other apaches or job runners. We don't want to always save to
			// cache as this can cause high cache I/O and LRU churn when a template changes.
			if ( $elapsed >= self::PARSE_THRESHOLD_SEC
				&& $page->shouldCheckParserCache( $parserOptions, $revision->getId() )
				&& $parserOutput->isCacheable()
			) {
				// Stamp the entry with the time the parse started, not the time it ended
				$ctime = wfTimestamp( TS_MW, (int)$start ); // cache time
				$services->getParserCache()->save(
					$parserOutput, $page, $parserOptions, $ctime, $revision->getId()
				);
			}
			$stats->increment( 'refreshlinks.parser_uncached' );
		}

		$updates = $content->getSecondaryDataUpdates(
			$title,
			null,
			!empty( $this->params['useRecursiveLinksUpdate'] ),
			$parserOutput
		);

		// For legacy hook handlers doing updates via LinksUpdateConstructed, make sure
		// any pending writes they made get flushed before the doUpdate() calls below.
		// This avoids snapshot-clearing errors in LinksUpdate::acquirePageLock().
		$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

		foreach ( $updates as $update ) {
			// Carry over the cause so the update can do extra logging
			$update->setCause( $this->params['causeAction'], $this->params['causeAgent'] );
			// FIXME: This code probably shouldn't be here?
			// Needed by things like Echo notifications which need
			// to know which user caused the links update
			if ( $update instanceof LinksUpdate ) {
				$update->setRevision( $revision );
				if ( !empty( $this->params['triggeringUser'] ) ) {
					$userInfo = $this->params['triggeringUser'];
					if ( $userInfo['userId'] ) {
						$user = User::newFromId( $userInfo['userId'] );
					} else {
						// Anonymous, use the username
						$user = User::newFromName( $userInfo['userName'], false );
					}
					$update->setTriggeringUser( $user );
				}
			}
		}

		// Run the updates only after every one of them has been configured above
		foreach ( $updates as $update ) {
			$update->setTransactionTicket( $ticket );
			$update->doUpdate();
		}

		InfoAction::invalidateCache( $title );

		// Commit any writes here in case this method is called in a loop.
		// In that case, the scoped lock will fail to be acquired.
		$lbFactory->commitAndWaitForReplication( __METHOD__, $ticket );

		return true;
	}

	/**
	 * Get the de-duplication info of the parent class, minus the fields that should
	 * not make two otherwise identical jobs look different.
	 *
	 * @return array
	 */
	public function getDeduplicationInfo() {
		$info = parent::getDeduplicationInfo();
		unset( $info['causeAction'] );
		unset( $info['causeAgent'] );
		if ( is_array( $info['params'] ) ) {
			// The constructor stores the cause fields in the job parameters, so they
			// have to be dropped from there for jobs differing only in cause to match
			unset( $info['params']['causeAction'] );
			unset( $info['params']['causeAgent'] );
			// For per-pages jobs, the job title is that of the template that changed
			// (or similar), so remove that since it ruins duplicate detection
			if ( isset( $info['params']['pages'] ) ) {
				unset( $info['namespace'] );
				unset( $info['title'] );
			}
		}

		return $info;
	}

	/**
	 * Count the pages whose links this job refreshes itself.
	 *
	 * @return int 0 for recursive jobs, the number of entries in 'pages' for
	 *  multi-page jobs, and 1 for single-title jobs
	 */
	public function workItemCount() {
		if ( !empty( $this->params['recursive'] ) ) {
			return 0; // nothing actually refreshed
		} elseif ( isset( $this->params['pages'] ) ) {
			return count( $this->params['pages'] );
		}

		return 1; // one title
	}
}